Example #1
    def experiment_explore_vs_exploit(meta_path, save_directory, setups, episodes, steps):    

        alpha = 0.0001
        basis_order = 3

        env = gym.make('CartPole-v0')
        env.reset()

        (obs, reward, done, info) = env.step(env.action_space.sample())
        obs = EnvWrapper.modified_sigmoid(obs)
        phi = fourier_basis(obs, order=basis_order)

        num_features = phi.shape[0]
        num_actions = env.action_space.n

        if not os.path.isdir(save_directory):
            os.mkdir(save_directory)

        num_samples = 5

        meta = pickle.load(open(meta_path, "rb")) 

        agents = []
        for setup in setups:
            gym_env = gym.make('CartPole-v0')
            gym_env.env.force_mag = setup["force"]
            gym_env.env.length = setup["pole_length"]
            gym_env.env.masscart = setup["masscart"]
            gym_env.env.masspole = setup["masspole"]
            
            env = EnvWrapper(gym_env, basis_order=basis_order, normalization=0)
            agent = LinearAgent(env, meta_policy=meta, alpha=alpha, algo="SARSA")
            

            agents.append( agent )

        policies = []
        for agent in agents:
            rewards = agent.train(num_episodes=episodes, max_steps=steps, verbose=True, update_meta=False, render=False)
            policies.append( copy.deepcopy(agent.learning_algorithm) )


        rewards = []        
        for i, agent in enumerate(agents):
            agent.learning_algorithm = policies[i]
            agent.random_action_prob = 0.0
            agent.RANDOM_ACTION_DECAY = 1.0
            exploit_rewards = agent.train(num_episodes=episodes, max_steps=steps, verbose=True, update_meta=False, render=False)

            agent.random_action_prob = 1.0
            explore_rewards = agent.train(num_episodes=episodes, max_steps=steps, verbose=True, update_meta=False, render=False)


            rewards.append( {"explore" : explore_rewards, "exploit" : exploit_rewards} )

        pickle.dump(rewards, open(save_directory+"/explore_exploit.pkl", "wb"))
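These examples repeatedly build features via fourier_basis(obs, order=basis_order) after squashing or rescaling the raw observation. That helper is not part of the listing; the following is a minimal sketch of a standard Fourier-basis feature map of the kind these snippets assume (observations already normalized to [0, 1]), not the project's actual implementation.

import itertools

import numpy as np


def fourier_basis(obs, order):
    # Sketch only: one feature per coefficient vector c in {0, ..., order}^d,
    # with phi_c(s) = cos(pi * c . s). The real helper may differ in detail.
    obs = np.asarray(obs, dtype=np.float64)
    coeffs = np.array(list(itertools.product(range(order + 1), repeat=obs.shape[0])))
    return np.cos(np.pi * coeffs.dot(obs))

With 4-dimensional CartPole observations and order = 3 this sketch yields (3 + 1)**4 = 256 features; the feature count of the real helper may differ.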
Example #2
phi = fourier_basis(obs, order=basis_order)

num_features = phi.shape[0]  # + len( cartpole_setup[0].keys() )
num_actions = env.action_space.n

# meta = MetaPolicy(num_features=num_features, num_actions=num_actions)

mazes = ["maze5.txt", "maze6.txt", "maze7.txt"]

for m in mazes:
    gym_env = AnimatEnv("./CustomEnvironments/" + m)

    env = EnvWrapper(gym_env, basis_order=basis_order, normalization=1)
    agent = LinearAgent(env,
                        meta_policy=None,
                        alpha=alpha,
                        beta=beta,
                        algo="PPO")
    agent.learning_algorithm.t_length = 8
    agent.learning_algorithm.update_steps = 64
    agent.learning_algorithm.epochs = 4
    agent.learning_algorithm.batch_size = 16

    dir = "./AnimatPPOEvalNoMeta/" + m.split(".")[0] + "/"
    # agent.random_action_prob = 0.0
    rewards = agent.train(num_episodes=500,
                          max_steps=800,
                          verbose=True,
                          update_meta=False,
                          render=False,
                          save_path=save_dir)
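Every agent in this listing is constructed over EnvWrapper(gym_env, basis_order=..., normalization=...), whose source is likewise not shown. The sketch below (building on the fourier_basis stand-in above, and guessing that normalization mode 0 means sigmoid squashing while mode 1 rescales by the environment's env_range) illustrates what such a wrapper roughly does; it is an assumption-laden stand-in, not the real class.

import numpy as np


class EnvWrapperSketch:
    # Illustrative stand-in for EnvWrapper: normalize raw observations and
    # expand them with the Fourier basis before the linear agent sees them.

    def __init__(self, env, basis_order, normalization):
        self.env = env
        self.basis_order = basis_order
        self.normalization = normalization
        self.action_space = env.action_space

    def _features(self, obs):
        obs = np.asarray(obs, dtype=np.float64)
        if self.normalization == 0:
            obs = 1.0 / (1.0 + np.exp(-obs))               # squash to (0, 1)
        else:
            low, high = self.env.env_range                  # assumed (low, high) pair
            obs = (obs - low) / (high - low)                # rescale to [0, 1]
        return fourier_basis(obs, order=self.basis_order)   # sketch defined above

    def reset(self):
        return self._features(self.env.reset())

    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        return self._features(obs), reward, done, info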
Example #3
    def experiment_meta_vs_random(meta_actor, meta_critic, save_directory,
                                  xml_models, episodes, steps):

        alpha = 1e-3
        beta = 1e-2

        env = gym.make("RoboschoolInvertedPendulum-v1")
        env.reset()

        (obs, reward, done, info) = env.step(env.action_space.sample())

        num_features = obs.shape[0]
        num_actions = env.action_space.low.shape[0]

        if not os.path.isdir(save_directory):
            os.mkdir(save_directory)

        num_samples = 5

        for k in range(1):  # only the k == 0 (meta) branch runs; the else below is unreachable
            if k == 0:
                meta = MetaPolicy(num_features=num_features,
                                  num_actions=num_actions,
                                  algo="PPO",
                                  alpha=1e-3,
                                  beta=1e-3,
                                  env=env)
                meta.learning_algorithm.load_model(meta_actor, meta_critic)
            else:
                meta = None

            agents = []
            for model in xml_models:
                domain_agents = []

                for _ in range(num_samples):
                    env = gym.make("RoboschoolInvertedPendulum-v1")
                    env.env.model_xml = model

                    agent = LinearAgent(env,
                                        meta_policy=meta,
                                        algo="PPO",
                                        alpha=alpha,
                                        beta=beta)
                    agent.learning_algorithm.t_length = 16
                    agent.learning_algorithm.update_steps = 64
                    agent.learning_algorithm.epochs = 8

                    domain_agents.append(agent)

                agents.append(domain_agents)

            domain_rewards_by_episode = {}
            for ep in range(episodes):
                null_actions = {}
                trajectories_by_domain = ExperimentsInvertedPendulum._run_episode(
                    domain_agents=agents, num_steps=steps, optimize_meta=False)

                domain_samples = []
                for i in trajectories_by_domain.keys():
                    sample_returns = []
                    for j in range(len(trajectories_by_domain[i])):

                        sample_returns.append(sum(
                            trajectories_by_domain[i][j]))

                    domain_samples.append(sample_returns)

                print("Episode %d / %d" % (ep, episodes))
                domain_rewards_by_episode[ep] = domain_samples

                if ep % 100 == 0:
                    filename = "meta_test_" + str(
                        ep) + ".pkl" if k == 0 else "no_meta_test_" + str(
                            ep) + ".pkl"
                    pickle.dump(domain_rewards_by_episode,
                                open(save_directory + "/" + filename, "wb"))
                    for ai, a in enumerate(agents):
                        type_ = "meta_" if k == 0 else "no_meta_"
                        type_ += str(ai) + "_"
                        a[0].learning_algorithm.save_model(
                            save_directory + "/" + type_, ep)
Example #4
    def experiment_train_meta(save_directory, meta_alpha, meta_beta,
                              xml_models):

        alpha = 1e-3
        beta = 1e-2

        env = gym.make("RoboschoolInvertedPendulum-v1")
        env.reset()

        (obs, reward, done, info) = env.step(env.action_space.sample())

        num_features = obs.shape[0]
        num_actions = env.action_space.low.shape[0]

        meta = MetaPolicy(num_features=num_features,
                          num_actions=num_actions,
                          algo="PPO",
                          alpha=meta_alpha,
                          beta=meta_beta,
                          env=env)
        meta.learning_algorithm.t_length = 32
        meta.learning_algorithm.update_steps = 64

        if not os.path.isdir(save_directory):
            os.mkdir(save_directory)

        meta_train_episodes = 30
        num_samples = 4
        for k in range(meta_train_episodes):
            agents = []

            print("Loading environments...")
            trial_by_domain = {}
            for i, model in enumerate(xml_models):

                domain_agents = []
                for _ in range(num_samples):
                    env = gym.make("RoboschoolInvertedPendulum-v1")
                    env.env.model_xml = model

                    agent = LinearAgent(env,
                                        meta_policy=meta,
                                        algo="PPO",
                                        alpha=alpha,
                                        beta=beta)

                    domain_agents.append(agent)

                    agent.learning_algorithm.t_length = 16
                    agent.learning_algorithm.update_steps = 64
                    agent.learning_algorithm.epochs = 8

                agents.append(domain_agents)

                trial_by_domain[i] = [list() for _ in range(num_samples)]
            print("Done loading...")

            episodes = 250
            steps = 500
            domain_rewards_by_episode = {}
            for ep in range(episodes):

                trajectories_by_domain = ExperimentsInvertedPendulum._run_episode(
                    domain_agents=agents, num_steps=steps, r_maxs=None)

                domain_samples = []
                for i in trajectories_by_domain.keys():
                    sample_returns = []
                    for j in range(len(trajectories_by_domain[i])):

                        sample_returns.append(sum(
                            trajectories_by_domain[i][j]))

                    domain_samples.append(sample_returns)

                print("Episode %d - Trial %d" % (ep, k))
                domain_rewards_by_episode[ep] = domain_samples

                if ep % 100 == 0:
                    val = (k * episodes) + ep
                    meta.learning_algorithm.save_model(save_directory + "/",
                                                       val)

                    # pickle.dump(meta, open(save_directory+"/meta_iter_"+str(k)+".pkl", "wb"))
                    pickle.dump(
                        domain_rewards_by_episode,
                        open(
                            save_directory + "/trajectory_iter_" + str(val) +
                            ".pkl", "wb"))
Example #5
import time

from PIL import Image

if __name__ == "__main__":
    usage = "usage: %prog [options] arg"
    parser = OptionParser(usage)
    parser.add_option("-e", "--environment", action="store", help="environment name", type="string", default="animat")
    parser.add_option("-m", "--model", action="store", help="env model", type="string")
    parser.add_option("-a", "--actor", action="store", help="agent actor", type="string")
    parser.add_option("-c", "--critic", action="store", help="agent critic", type="string")
    parser.add_option("-s", "--save_path", action="store", help="path for saving vid", type="string", default=None)

    (options, args) = parser.parse_args()

    env = options.environment

    if env != "animat":
        env = gym.make( env )
        env.env.model_xml = options.model
    else:
        gym_env = AnimatEnv(options.model)
        env = EnvWrapper(gym_env, basis_order=3, normalization=1)

    agent = LinearAgent(env, meta_policy=None, algo="PPO")
    agent.random_action_prob = 0.0

    agent.learning_algorithm.load_model(options.actor, options.critic)
    agent.play( max_steps=10000, delay=0.01, save_path=options.save_path)

    time.sleep(0.5)
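For completeness, the playback script above is driven entirely by its command-line flags; a hypothetical invocation (the script filename and the weight/maze paths are purely illustrative, not from the repository) might look like:

# python play_agent.py -e animat -m ./CustomEnvironments/maze1.txt \
#     -a actor_weights -c critic_weights -s ./videos/run1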
Example #6
    def experiment_explore_vs_exploit(meta_path, save_directory, setups,
                                      episodes, steps):

        alpha = 0.001
        basis_order = 3

        env = AnimatEnv("./CustomEnvironments/maze1.txt")
        env.reset()

        (obs, reward, done, info) = env.step(env.action_space.sample())
        obs = EnvWrapper.normalize_range(obs, env.env_range)
        phi = fourier_basis(obs, order=basis_order)

        num_features = phi.shape[0]
        num_actions = env.action_space.n

        if not os.path.isdir(save_directory):
            os.mkdir(save_directory)

        num_samples = 5

        meta = pickle.load(open(meta_path, "rb"))

        agents = []
        for setup in setups:
            gym_env = AnimatEnv(setup)

            env = EnvWrapper(gym_env, basis_order=basis_order, normalization=1)
            agent = LinearAgent(env,
                                meta_policy=meta,
                                alpha=alpha,
                                algo="REINFORCE")

            agents.append(agent)

        policies = []
        for agent in agents:
            rewards = agent.train(num_episodes=episodes,
                                  max_steps=steps,
                                  verbose=True,
                                  update_meta=False,
                                  render=False)
            policies.append(copy.deepcopy(agent.learning_algorithm))

        rewards = []
        for i, agent in enumerate(agents):
            agent.learning_algorithm = policies[i]
            agent.random_action_prob = 0.0
            agent.RANDOM_ACTION_DECAY = 1.0
            exploit_rewards = agent.train(num_episodes=episodes,
                                          max_steps=steps,
                                          verbose=True,
                                          update_meta=False,
                                          render=False)

            agent.random_action_prob = 1.0
            explore_rewards = agent.train(num_episodes=episodes,
                                          max_steps=steps,
                                          verbose=True,
                                          update_meta=False,
                                          render=False)

            rewards.append({
                "explore": explore_rewards,
                "exploit": exploit_rewards
            })

        pickle.dump(rewards, open(save_directory + "/explore_exploit.pkl",
                                  "wb"))
Example #7
    def experiment_with_without_actions(meta_path, save_directory, setups,
                                        episodes, steps):

        alpha = 0.001
        basis_order = 3

        env = AnimatEnv("./CustomEnvironments/maze1.txt")
        env.reset()

        (obs, reward, done, info) = env.step(env.action_space.sample())
        obs = EnvWrapper.normalize_range(obs, env.env_range)
        phi = fourier_basis(obs, order=basis_order)

        num_features = phi.shape[0]
        num_actions = env.action_space.n

        if not os.path.isdir(save_directory):
            os.mkdir(save_directory)

        num_samples = 5

        for k in range(2):
            meta = pickle.load(open(meta_path, "rb"))

            agents = []
            for setup in setups:
                domain_agents = []

                for _ in range(num_samples):
                    gym_env = AnimatEnv(setup)

                    env = EnvWrapper(gym_env,
                                     basis_order=basis_order,
                                     normalization=1)
                    if k == 0:
                        prevent_actions = gym_env.action_space.useless_actions
                    else:
                        prevent_actions = None

                    agent = LinearAgent(env,
                                        meta_policy=meta,
                                        algo="REINFORCE",
                                        prevent_actions=prevent_actions)
                    domain_agents.append(agent)

                agents.append(domain_agents)

            domain_rewards_by_episode = {}
            for ep in range(episodes):

                trajectories_by_domain = ExperimentsAnimat._run_episode(
                    domain_agents=agents, num_steps=steps)

                domain_samples = []
                for i in trajectories_by_domain.keys():
                    sample_rewards = []
                    for j in range(len(trajectories_by_domain[i])):
                        t_rewards = []
                        for t in trajectories_by_domain[i][j]:
                            action, explore = t['action']

                            t_rewards.append(t['reward'])

                        sample_rewards.append(t_rewards)
                    domain_samples.append(sample_rewards)

                print("Episode %d" % (ep))
                domain_rewards_by_episode[ep] = domain_samples

            filename = "without_actions.pkl" if k == 0 else "with_actions.pkl"

            pickle.dump(domain_rewards_by_episode,
                        open(save_directory + "/" + filename, "wb"))
Example #8
    def experiment_meta_vs_random(meta_actor, meta_critic, save_directory,
                                  setups, episodes, steps):

        alpha = 1e-4
        beta = 1e-3
        basis_order = 3

        env = AnimatEnv("./CustomEnvironments/maze1.txt")
        env.reset()

        (obs, reward, done, info) = env.step(env.action_space.sample())
        obs = EnvWrapper.normalize_range(obs, env.env_range)
        phi = fourier_basis(obs, order=basis_order)

        num_features = phi.shape[0]
        num_actions = env.action_space.n
        env = EnvWrapper(env, basis_order=basis_order, normalization=1)

        if not os.path.isdir(save_directory):
            os.mkdir(save_directory)

        num_samples = 3

        for k in range(2):
            k = 1  # overrides the loop index: both passes take the no-meta branch below
            if k == 0:
                meta = MetaPolicy(num_features=num_features,
                                  num_actions=num_actions,
                                  algo="PPO",
                                  alpha=1e-3,
                                  beta=1e-3,
                                  env=env)
                meta.learning_algorithm.load_model(meta_actor, meta_critic)
            else:
                meta = None

            agents = []
            for setup in setups:
                domain_agents = []

                for _ in range(num_samples):
                    gym_env = AnimatEnv(setup)

                    env = EnvWrapper(gym_env,
                                     basis_order=basis_order,
                                     normalization=1)
                    agent = LinearAgent(env,
                                        meta_policy=meta,
                                        algo="PPO",
                                        alpha=alpha,
                                        beta=beta)
                    agent.learning_algorithm.t_length = 8
                    agent.learning_algorithm.update_steps = 64
                    agent.learning_algorithm.epochs = 4
                    agent.learning_algorithm.batch_size = 16
                    domain_agents.append(agent)

                agents.append(domain_agents)

            domain_rewards_by_episode = {}
            null_action_by_episode = {}
            for ep in range(episodes):
                null_actions = {}
                trajectories_by_domain = ExperimentsAnimat._run_episode(
                    domain_agents=agents, num_steps=steps)

                domain_samples = []
                for i in trajectories_by_domain.keys():
                    sample_rewards = []
                    for j in range(len(trajectories_by_domain[i])):
                        t_rewards = []
                        for t in trajectories_by_domain[i][j]:
                            action, explore = t['action']
                            a = agents[i][j].env.env.action_space.actions[action]
                            effect = agents[i][j].env.env.animat._action_effect(a)
                            if math.fabs(effect[0]) < 0.1 and math.fabs(effect[1]) < 0.1:
                                if action in null_actions:
                                    null_actions[action] += 1
                                else:
                                    null_actions[action] = 1

                            t_rewards.append(t['reward'])

                        sample_rewards.append(sum(t_rewards))
                    domain_samples.append(sample_rewards)

                print("Episode %d" % (ep))
                domain_rewards_by_episode[ep] = domain_samples
                null_action_by_episode[ep] = null_actions

                if ep % 10 == 0:
                    filename = "meta_test_" + str(
                        ep) + ".pkl" if k == 0 else "no_meta_test_" + str(
                            ep) + ".pkl"
                    filename2 = "null_actions_meta_" + str(
                        ep
                    ) + ".pkl" if k == 0 else "null_actions_no_meta_" + str(
                        ep) + ".pkl"
                    pickle.dump(domain_rewards_by_episode,
                                open(save_directory + "/" + filename, "wb"))
                    pickle.dump(null_action_by_episode,
                                open(save_directory + "/" + filename2, "wb"))

                    for ai, a in enumerate(agents):
                        type_ = "meta_" if k == 0 else "no_meta_"
                        type_ += str(ai) + "_"
                        a[0].learning_algorithm.save_model(
                            save_directory + "/" + type_, ep)
Example #9
    def experiment_random_baseline(save_directory):

        env = AnimatEnv("./CustomEnvironments/maze1.txt")
        env.reset()
        basis_order = ExperimentsAnimat.RECORDED_DATA[0]['order']

        (obs, reward, done, info) = env.step(env.action_space.sample())
        obs = EnvWrapper.normalize_range(obs, env.env_range)
        phi = fourier_basis(obs, order=basis_order)

        num_features = phi.shape[0]
        num_actions = env.action_space.n

        if not os.path.isdir(save_directory):
            os.mkdir(save_directory)

        meta_train_episodes = 100
        num_samples = 1
        for k in range(meta_train_episodes):
            agents = []

            print("Loading environments...")
            r_maxs = []
            trial_by_domain = {}
            for i, d in enumerate(ExperimentsAnimat.RECORDED_DATA):
                print("Setup: " + str(d['setup']))
                setup = d['setup']
                max_r = d['max_r']
                # episodes = d['episodes']
                # steps = d['max_steps']
                episodes = 600
                steps = 600
                basis_order = d['order']

                domain_agents = []
                for _ in range(num_samples):
                    gym_env = AnimatEnv(setup)
                    env = EnvWrapper(gym_env,
                                     basis_order=basis_order,
                                     normalization=1)
                    agent = LinearAgent(env,
                                        meta_policy=None,
                                        algo="REINFORCE")
                    domain_agents.append(agent)

                agents.append(domain_agents)
                r_maxs.append(max_r)

                trial_by_domain[i] = [list() for _ in range(num_samples)]
            print("Done loading...")

            domain_rewards_by_episode = {}
            for ep in range(episodes):

                trajectories_by_domain = ExperimentsAnimat._run_episode(
                    domain_agents=agents, num_steps=steps)

                domain_samples = []
                for i in trajectories_by_domain.keys():
                    sample_rewards = []
                    for j in range(len(trajectories_by_domain[i])):
                        t_rewards = []
                        for t in trajectories_by_domain[i][j]:
                            t_rewards.append(t['reward'])
                            t['reward'] = t['reward'] / r_maxs[i]
                            trial_by_domain[i][j].append(t)

                        sample_rewards.append(t_rewards)
                    domain_samples.append(sample_rewards)

                print("Episode %d - Trial %d" % (ep, k))
                domain_rewards_by_episode[ep] = domain_samples

            pickle.dump(
                domain_rewards_by_episode,
                open(save_directory + "/trajectory_iter_" + str(k) + ".pkl",
                     "wb"))
Example #10
    def experiment_train_meta(save_directory, meta_alpha, meta_beta):
        alpha = 1e-4
        beta = 1e-3

        gym_env = AnimatEnv("./CustomEnvironments/maze1.txt")
        gym_env.reset()
        basis_order = 0  # ExperimentsAnimat.RECORDED_DATA[0]['order']

        (obs, reward, done, info) = gym_env.step(gym_env.action_space.sample())
        # obs = EnvWrapper.normalize_range(obs, gym_env.env_range)
        # phi = fourier_basis(obs, order=basis_order)

        num_features = obs.shape[0]
        num_actions = gym_env.action_space.n

        env = EnvWrapper(gym_env, basis_order=basis_order, normalization=1)

        meta = MetaPolicy(num_features=num_features,
                          num_actions=num_actions,
                          algo="PPO",
                          alpha=meta_alpha,
                          beta=meta_beta,
                          env=env)
        meta.learning_algorithm.t_length = 32
        meta.learning_algorithm.update_steps = 256

        if not os.path.isdir(save_directory):
            os.mkdir(save_directory)

        meta_train_episodes = 30
        num_samples = 3
        for k in range(meta_train_episodes):
            agents = []

            print("Loading environments...")
            r_maxs = []
            trial_by_domain = {}
            for i, d in enumerate(ExperimentsAnimat.RECORDED_DATA):
                print("Setup: " + str(d['setup']))
                setup = d['setup']
                max_r = d['max_r']
                episodes = d['episodes']
                # steps = 500 #d['max_steps']
                # episodes = 1000
                steps = 600
                basis_order = d['order']

                domain_agents = []
                for _ in range(num_samples):
                    gym_env = AnimatEnv(setup)
                    env = EnvWrapper(gym_env,
                                     basis_order=basis_order,
                                     normalization=1)
                    agent = LinearAgent(env,
                                        meta_policy=meta,
                                        algo="PPO",
                                        alpha=alpha,
                                        beta=beta)
                    domain_agents.append(agent)

                    agent.learning_algorithm.t_length = 32
                    agent.learning_algorithm.update_steps = 128
                    agent.learning_algorithm.epochs = 4
                    agent.learning_algorithm.batch_size = 16

                agents.append(domain_agents)
                r_maxs.append(max_r)

                trial_by_domain[i] = [list() for _ in range(num_samples)]
            print("Done loading...")

            domain_rewards_by_episode = {}
            for ep in range(episodes):

                trajectories_by_domain = ExperimentsAnimat._run_episode(
                    domain_agents=agents, num_steps=steps, r_maxs=r_maxs)

                domain_samples = []
                for i in trajectories_by_domain.keys():
                    sample_rewards = []
                    for j in range(len(trajectories_by_domain[i])):
                        t_rewards = []
                        for t in trajectories_by_domain[i][j]:
                            t_rewards.append(t['reward'])
                            t['reward'] = t['reward'] / r_maxs[i]
                            trial_by_domain[i][j].append(t)

                        sample_rewards.append(sum(t_rewards))
                    domain_samples.append(sample_rewards)

                print("Episode %d - Trial %d" % (ep, k))
                domain_rewards_by_episode[ep] = domain_samples

                if ep % 10 == 0:
                    val = (k * episodes) + ep
                    meta.learning_algorithm.save_model(save_directory + "/",
                                                       val)
                    # pickle.dump(meta, open(save_directory+"/meta_iter_"+str(k)+".pkl", "wb"))
                    pickle.dump(
                        domain_rewards_by_episode,
                        open(
                            save_directory + "/trajectory_iter_" + str(val) +
                            ".pkl", "wb"))

            trajectories = []
            for key in trial_by_domain.keys():
                for traj in trial_by_domain[key]:
                    trajectories.append(traj)

            if meta.algo == "REINFORCE":
                print("Updating meta....")
                meta.montecarlo_update(trajectories)
Example #11
    def experiment_meta_vs_random(meta_actor, meta_critic, save_directory, setups, episodes, steps):    

        alpha = 0.001
        basis_order = 3

        env = gym.make('CartPole-v0')
        env.reset()

        (obs, reward, done, info) = env.step(env.action_space.sample())
        obs = EnvWrapper.modified_sigmoid(obs)
        phi = fourier_basis(obs, order=basis_order)

        num_features = phi.shape[0]
        num_actions = env.action_space.n

        if not os.path.isdir(save_directory):
            os.mkdir(save_directory)

        num_samples = 5

        for k in range(2):
            if k == 0:
                meta = MetaPolicy(num_features=num_features, num_actions=num_actions, algo="PPO", alpha=1e-3, beta=1e-3, env=env)
                meta.learning_algorithm.load_model(meta_actor, meta_critic)
            else:
                meta = None

            agents = []
            for setup in setups:
                domain_agents = []
                for _ in range(num_samples):
                    gym_env = gym.make('CartPole-v0')
                    gym_env.env.force_mag = setup["force"]
                    gym_env.env.length = setup["pole_length"]
                    gym_env.env.masscart = setup["masscart"]
                    gym_env.env.masspole = setup["masspole"]

                    env = EnvWrapper(gym_env, basis_order=basis_order, normalization=0)
                    agent = LinearAgent(env, meta_policy=meta, alpha=alpha, algo="PPO")

                    if meta is None:
                        agent.random_action_prob = 0.0

                    domain_agents.append( agent )

                agents.append( domain_agents )

            domain_rewards_by_episode = {}
            for ep in range(episodes):
                
                trajectories_by_domain = ExperimentsCartpole._run_episode(domain_agents=agents, num_steps=steps, optimize_meta=False)
                
                domain_samples = []
                for i in trajectories_by_domain.keys():
                    sample_rewards = []
                    for j in range(len(trajectories_by_domain[i])):
                        t_rewards = []
                        for t in trajectories_by_domain[i][j]:
                            t_rewards.append( t['reward'] )
                    
                        sample_rewards.append( t_rewards )
                    domain_samples.append( sample_rewards )

                print("Episode %d" %(ep))
                domain_rewards_by_episode[ep] = domain_samples

                if ep % 100 == 0:
                    filename = "meta_test"+str(ep)+".pkl" if k == 0 else "no_meta_test_"+str(ep)+".pkl"
                    pickle.dump(domain_rewards_by_episode, open(save_directory+"/"+filename, "wb"))
Example #12
    def experiment_random_baseline(save_directory):    

        env = gym.make('CartPole-v0')
        env.reset()
        basis_order = ExperimentsCartpole.RECORDED_DATA[0]['order']

        (obs, reward, done, info) = env.step(env.action_space.sample())
        obs = EnvWrapper.modified_sigmoid(obs)
        phi = fourier_basis(obs, order=basis_order)

        num_features = phi.shape[0]
        num_actions = env.action_space.n

        
        if not os.path.isdir(save_directory):
            os.mkdir(save_directory)

        meta_train_episodes = 100
        num_samples = 5
        for k in range(meta_train_episodes):
            agents = []

            print("Loading environments...")
            r_maxs = []
            trial_by_domain = {}
            for i, d in enumerate(ExperimentsCartpole.RECORDED_DATA):
                print("Setup: " + str(d['setup']))
                setup = d['setup']
                max_r = d['max_r']
                episodes = d['episodes']
                steps = d['max_steps']
                basis_order = d['order']
                # alpha = d['alpha']	        
                alpha = 0.0001
                
                domain_agents = []
                for _ in range(num_samples):
                    gym_env = gym.make('CartPole-v0')
                    gym_env.env.force_mag = setup["force"]
                    gym_env.env.length = setup["pole_length"]
                    gym_env.env.masscart = setup["masscart"]
                    gym_env.env.masspole = setup["masspole"]

                    env = EnvWrapper(gym_env, basis_order=basis_order, normalization=0)
                    agent = LinearAgent(env, meta_policy=None, alpha=alpha, algo="SARSA")

                    domain_agents.append( agent )

                agents.append( domain_agents )
                r_maxs.append( max_r )

                trial_by_domain[i] = [ list() for _ in range(num_samples) ]
            print("Done loading...")

            
            domain_rewards_by_episode = {}
            for ep in range(episodes):
                
                trajectories_by_domain = ExperimentsCartpole._run_episode(domain_agents=agents, num_steps=steps)
                
                domain_samples = []
                for i in trajectories_by_domain.keys():
                    sample_rewards = []
                    for j in range(len(trajectories_by_domain[i])):
                        t_rewards = []
                        for t in trajectories_by_domain[i][j]:
                            t_rewards.append( t['reward'] )
                    
                        sample_rewards.append( t_rewards )
                    domain_samples.append( sample_rewards )

                print("Episode %d - Trial %d" %(ep, k))
                domain_rewards_by_episode[ep] = domain_samples

            pickle.dump(domain_rewards_by_episode, open(save_directory+"/trajectory_iter_"+str(k)+".pkl", "wb"))
Example #13
    def experiment_train_meta(save_directory, meta_alpha, meta_beta):    

        gym_env = gym.make('CartPole-v0')
        gym_env.reset()
        basis_order = ExperimentsCartpole.RECORDED_DATA[0]['order']

        (obs, reward, done, info) = gym_env.step(gym_env.env.action_space.sample())
        obs = EnvWrapper.modified_sigmoid(obs)
        phi = fourier_basis(obs, order=basis_order)

        num_features = phi.shape[0]
        num_actions = gym_env.env.action_space.n

        env = EnvWrapper(gym_env, basis_order=basis_order, normalization=0)

        meta = MetaPolicy(num_features=num_features, num_actions=num_actions, algo="PPO", alpha=meta_alpha, beta=meta_beta, env=env)

        if not os.path.isdir(save_directory):
            os.mkdir(save_directory)

        meta_train_episodes = 500
        num_samples = 3
        for k in range(meta_train_episodes):
            agents = []

            print("Loading environments...")
            r_maxs = []
            trial_by_domain = {}
            for i, d in enumerate(ExperimentsCartpole.RECORDED_DATA):
                print("Setup: " + str(d['setup']))
                setup = d['setup']
                max_r = d['max_r']
                episodes = d['episodes']
                steps = d['max_steps']
                basis_order = d['order']
                # alpha = d['alpha']	        
                alpha = 0.0001
                
                domain_agents = []
                for _ in range(num_samples):
                    gym_env = gym.make('CartPole-v0')
                    gym_env.env.force_mag = setup["force"]
                    gym_env.env.length = setup["pole_length"]
                    gym_env.env.masscart = setup["masscart"]
                    gym_env.env.masspole = setup["masspole"]

                    env = EnvWrapper(gym_env, basis_order=basis_order, normalization=0)
                    agent = LinearAgent(env, meta_policy=meta, alpha=alpha, algo="PPO")

                    domain_agents.append( agent )

                agents.append( domain_agents )
                r_maxs.append( max_r )

                trial_by_domain[i] = [ list() for _ in range(num_samples) ]
            print("Done loading...")

            
            domain_rewards_by_episode = {}
            for ep in range(episodes):
                
                trajectories_by_domain = ExperimentsCartpole._run_episode(domain_agents=agents, num_steps=steps, r_maxs=r_maxs)
                
                domain_samples = []
                for i in trajectories_by_domain.keys():
                    sample_rewards = []
                    for j in range(len(trajectories_by_domain[i])):
                        t_rewards = []
                        for t in trajectories_by_domain[i][j]:
                            t_rewards.append( t['reward'] )
                            t['reward'] = t['reward'] / r_maxs[i] 
                            trial_by_domain[i][j].append( t )
                    
                        sample_rewards.append( t_rewards )
                    domain_samples.append( sample_rewards )

                print("Episode %d - Trial %d" %(ep, k))
                domain_rewards_by_episode[ep] = domain_samples

            trajectories = []
            for key in trial_by_domain.keys():
                for traj in trial_by_domain[key]:
                    trajectories.append( traj )

            if meta.algo == "REINFORCE":
                print("Updating meta....")
                meta.montecarlo_update(trajectories)

            meta.learning_algorithm.save_model(save_directory+"/", k)
            # pickle.dump(meta, open(save_directory+"/meta_iter_"+str(k)+".pkl", "wb"))
            pickle.dump(domain_rewards_by_episode, open(save_directory+"/trajectory_iter_"+str(k)+".pkl", "wb"))
Example #14
import numpy as np
import gym
import matplotlib.pyplot as plt  # required for the reward plot at the end of this example

basis_order = 1
alpha = 1e-2
beta = 1e-2
setup = {"force" : 20.0, "pole_length" : 1.2, "masscart" : 5.0, "masspole" : 0.1}        
gym_env = gym.make('CartPole-v0')
gym_env.env.force_mag = setup["force"]
gym_env.env.length = setup["pole_length"]
gym_env.env.masscart = setup["masscart"]
gym_env.env.masspole = setup["masspole"]

env = EnvWrapper(gym_env, basis_order=basis_order, normalization=0)

agent = LinearAgent(env, meta_policy=None, alpha=alpha, beta=beta, algo="PPO")
agent.learning_algorithm.t_length = 8
agent.learning_algorithm.update_steps = 16
agent.learning_algorithm.epochs = 4
agent.learning_algorithm.batch_size = 8

rewards = agent.train(num_episodes=500, max_steps=1000, verbose=True, update_meta=False, render=False)

rewards = [ np.mean(rewards[i*10:(i+1)*10]) for i in range(len(rewards)//10) ]
plt.plot(range(len(rewards)), rewards)
plt.show(block=True)
# for _ in range(10000):
#     reward, done, update_info = agent.perform_step(update_meta=False)
#     env.render()
#     time.sleep(1.0)
Example #15
args = namedtuple("parser", d_args.keys())(*d_args.values())
args.out
# create the directory here and tensorboard writer here

os.makedirs(args.out, exist_ok=True)

# create the tensorboard summary writer here
tb_log_dir = os.path.join(args.log_dir, "2DGrid", name, 'tb_logs')

#Susan added this line
csv_log_dir = os.path.join(args.log_dir, "2DGrid", name, 'csv_logs')

print("Log dir", tb_log_dir)
print("Out dir", args.out)
#Susan added this line
print("csv log dir", csv_log_dir)

if args.reset_dir:
    shutil.rmtree(tb_log_dir, ignore_errors=True)
    #Susan added this line
    shutil.rmtree(csv_log_dir, ignore_errors=True)
os.makedirs(tb_log_dir, exist_ok=True)
#Susan added this line
os.makedirs(csv_log_dir, exist_ok=True)
tb_writer = SummaryWriter(log_dir=tb_log_dir)

agent = LinearAgent(args, env, action_noise, featurize_state, featurize_action, tb_log_dir, csv_log_dir)
# run the agent here

agent.run()
Example #16
        }]
    else:
        print("Unrecognized environment: " + env_name)
        assert False

    if not os.path.isdir(save_dir):
        os.mkdir(save_dir)

    data = []
    for setup in setups:
        if env_name.lower() == "animat":
            env = init_env_animat(setup)
        elif env_name.lower() == "cartpole":
            env = init_env_cartpole(setup)

        agent = LinearAgent(env, meta_policy=None, alpha=alpha, algo=algo)

        rewards = agent.train(num_episodes=episodes,
                              max_steps=steps,
                              verbose=True,
                              update_meta=False,
                              render=False)

        setup_data = {
            'setup': setup,
            'max_r': max(rewards),
            'episodes': episodes,
            'max_steps': steps,
            'alpha': alpha,
            'order': basis_order,
            'algo': algo