Example #1
    def experiment_explore_vs_exploit(meta_path, save_directory, setups, episodes, steps):    

        alpha = 0.0001
        basis_order = 3

        env = gym.make('CartPole-v0')
        env.reset()

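        # Take one random step to obtain a sample observation and size the Fourier feature vector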
        (obs, reward, done, info) = env.step(env.action_space.sample())
        obs = EnvWrapper.modified_sigmoid(obs)
        phi = fourier_basis(obs, order=basis_order)

        num_features = phi.shape[0]
        num_actions = env.action_space.n

        if not os.path.isdir(save_directory):
            os.mkdir(save_directory)

        num_samples = 5

        meta = pickle.load(open(meta_path, "rb")) 

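        # Build one SARSA agent per CartPole variant described in setups (force, pole length, cart/pole mass)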
        agents = []
        for setup in setups:
            gym_env = gym.make('CartPole-v0')
            gym_env.env.force_mag = setup["force"]
            gym_env.env.length = setup["pole_length"]
            gym_env.env.masscart = setup["masscart"]
            gym_env.env.masspole = setup["masspole"]
            
            env = EnvWrapper(gym_env, basis_order=basis_order, normalization=0)
            agent = LinearAgent(env, meta_policy=meta, alpha=alpha, algo="SARSA")
            

            agents.append( agent )

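        # First pass: train each agent and keep a deep copy of its learned policy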
        policies = []
        for agent in agents:
            rewards = agent.train(num_episodes=episodes, max_steps=steps, verbose=True, update_meta=False, render=False)
            policies.append( copy.deepcopy(agent.learning_algorithm) )


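        # Second pass: re-run each saved policy greedily (exploit) and with fully random actions (explore)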
        rewards = []        
        for i, agent in enumerate(agents):
            agent.learning_algorithm = policies[i]
            agent.random_action_prob = 0.0
            agent.RANDOM_ACTION_DECAY = 1.0
            exploit_rewards = agent.train(num_episodes=episodes, max_steps=steps, verbose=True, update_meta=False, render=False)

            agent.random_action_prob = 1.0
            explore_rewards = agent.train(num_episodes=episodes, max_steps=steps, verbose=True, update_meta=False, render=False)


            rewards.append( {"explore" : explore_rewards, "exploit" : exploit_rewards} )

        pickle.dump(rewards, open(save_directory+"/explore_exploit.pkl", "wb"))
Example #2
import time
from optparse import OptionParser

import gym
from PIL import Image

# AnimatEnv, EnvWrapper and LinearAgent are project-local classes and are
# assumed to be importable from the surrounding package.

if __name__ == "__main__":
    usage = "usage: %prog [options] arg"
    parser = OptionParser(usage)
    parser.add_option("-e", "--environment", action="store", help="environment name", type="string", default="animat")
    parser.add_option("-m", "--model", action="store", help="env model", type="string")
    parser.add_option("-a", "--actor", action="store", help="agent actor", type="string")
    parser.add_option("-c", "--critic", action="store", help="agent critic", type="string")
    parser.add_option("-s", "--save_path", action="store", help="path for saving vid", type="string", default=None)

    (options, args) = parser.parse_args()

    env = options.environment

    if env != "animat":
        env = gym.make( env )
        env.env.model_xml = options.model
    else:
        gym_env = AnimatEnv(options.model)
        env = EnvWrapper(gym_env, basis_order=3, normalization=1)

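    # Greedy playback of the pre-trained PPO actor/critic (no exploration)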
    agent = LinearAgent(env, meta_policy=None, algo="PPO")
    agent.random_action_prob = 0.0

    agent.learning_algorithm.load_model(options.actor, options.critic)
    agent.play( max_steps=10000, delay=0.01, save_path=options.save_path)

    time.sleep(0.5)
Example #3
    def experiment_explore_vs_exploit(meta_path, save_directory, setups,
                                      episodes, steps):

        alpha = 0.001
        basis_order = 3

        env = AnimatEnv("./CustomEnvironments/maze1.txt")
        env.reset()

        (obs, reward, done, info) = env.step(env.action_space.sample())
        obs = EnvWrapper.normalize_range(obs, env.env_range)
        phi = fourier_basis(obs, order=basis_order)

        num_features = phi.shape[0]
        num_actions = env.action_space.n

        if not os.path.isdir(save_directory):
            os.mkdir(save_directory)

        num_samples = 5

        meta = pickle.load(open(meta_path, "rb"))

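        # One REINFORCE agent per Animat setup, each sharing the pre-trained meta-policy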
        agents = []
        for setup in setups:
            gym_env = AnimatEnv(setup)

            env = EnvWrapper(gym_env, basis_order=basis_order, normalization=1)
            agent = LinearAgent(env,
                                meta_policy=meta,
                                alpha=alpha,
                                algo="REINFORCE")

            agents.append(agent)

        policies = []
        for agent in agents:
            rewards = agent.train(num_episodes=episodes,
                                  max_steps=steps,
                                  verbose=True,
                                  update_meta=False,
                                  render=False)
            policies.append(copy.deepcopy(agent.learning_algorithm))

        rewards = []
        for i, agent in enumerate(agents):
            agent.learning_algorithm = policies[i]
            agent.random_action_prob = 0.0
            agent.RANDOM_ACTION_DECAY = 1.0
            exploit_rewards = agent.train(num_episodes=episodes,
                                          max_steps=steps,
                                          verbose=True,
                                          update_meta=False,
                                          render=False)

            agent.random_action_prob = 1.0
            explore_rewards = agent.train(num_episodes=episodes,
                                          max_steps=steps,
                                          verbose=True,
                                          update_meta=False,
                                          render=False)

            rewards.append({
                "explore": explore_rewards,
                "exploit": exploit_rewards
            })

        pickle.dump(rewards, open(save_directory + "/explore_exploit.pkl",
                                  "wb"))
Example #4
    def experiment_meta_vs_random(meta_actor, meta_critic, save_directory, setups, episodes, steps):    

        alpha = 0.001
        basis_order = 3

        env = gym.make('CartPole-v0')
        env.reset()

        (obs, reward, done, info) = env.step(env.action_space.sample())
        obs = EnvWrapper.modified_sigmoid(obs)
        phi = fourier_basis(obs, order=basis_order)

        num_features = phi.shape[0]
        num_actions = env.action_space.n

        if not os.path.isdir(save_directory):
            os.mkdir(save_directory)

        num_samples = 5

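        # k == 0: agents are guided by the pre-trained meta-policy; k == 1: agents learn without it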
        for k in range(2):
            if k == 0:
                meta = MetaPolicy(num_features=num_features, num_actions=num_actions, algo="PPO", alpha=1e-3, beta=1e-3, env=env)
                meta.learning_algorithm.load_model(meta_actor, meta_critic)
            else:
                meta = None

            agents = []
            for setup in setups:
                domain_agents = []
                for _ in range(num_samples):
                    gym_env = gym.make('CartPole-v0')
                    gym_env.env.force_mag = setup["force"]
                    gym_env.env.length = setup["pole_length"]
                    gym_env.env.masscart = setup["masscart"]
                    gym_env.env.masspole = setup["masspole"]

                    env = EnvWrapper(gym_env, basis_order=basis_order, normalization=0)
                    agent = LinearAgent(env, meta_policy=meta, alpha=alpha, algo="PPO")

                    if meta is None:
                        agent.random_action_prob = 0.0

                    domain_agents.append( agent )

                agents.append( domain_agents )

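            # Run one episode across all domain agents and collect per-step rewards for every sample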
            domain_rewards_by_episode = {}
            for ep in range(episodes):
                
                trajectories_by_domain = ExperimentsCartpole._run_episode(domain_agents=agents, num_steps=steps, optimize_meta=False)
                
                domain_samples = []
                for i in trajectories_by_domain.keys():
                    sample_rewards = []
                    for j in range(len(trajectories_by_domain[i])):
                        t_rewards = []
                        for t in trajectories_by_domain[i][j]:
                            t_rewards.append( t['reward'] )
                    
                        sample_rewards.append( t_rewards )
                    domain_samples.append( sample_rewards )

                print("Episode %d" %(ep))
                domain_rewards_by_episode[ep] = domain_samples

                if ep % 100 == 0:
                    filename = "meta_test"+str(ep)+".pkl" if k == 0 else "no_meta_test_"+str(ep)+".pkl"
                    pickle.dump(domain_rewards_by_episode, open(save_directory+"/"+filename, "wb"))
Example #5
    def experiment_meta_vs_random(meta_actor, meta_critic, save_directory,
                                  xml_models, episodes, steps):

        alpha = 1e-4
        beta = 1e-3

        env = gym.make("RoboschoolHopper-v1")
        env.reset()

        (obs, reward, done, info) = env.step(env.action_space.sample())

        num_features = obs.shape[0]
        num_actions = env.action_space.low.shape[0]

        if not os.path.isdir(save_directory):
            os.mkdir(save_directory)

        num_samples = 5

        for k in range(2):
            if k == 0:
                meta = MetaPolicy(num_features=num_features,
                                  num_actions=num_actions,
                                  algo="PPO",
                                  alpha=1e-3,
                                  beta=1e-3,
                                  env=env)
                meta.learning_algorithm.load_model(meta_actor, meta_critic)
            else:
                meta = None

            agents = []
            for model in xml_models:
                domain_agents = []

                for _ in range(num_samples):
                    env = gym.make("RoboschoolHopper-v1")
                    env.env.model_xml = model

                    agent = LinearAgent(env,
                                        meta_policy=meta,
                                        algo="PPO",
                                        alpha=alpha,
                                        beta=beta)
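                    # PPO rollout and update hyper-parameters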
                    agent.learning_algorithm.t_length = 128
                    agent.learning_algorithm.update_steps = 128
                    agent.learning_algorithm.epochs = 8

                    if meta is None:
                        agent.random_action_prob = 0.0

                    domain_agents.append(agent)

                agents.append(domain_agents)

            domain_rewards_by_episode = {}
            for ep in range(episodes):
                null_actions = {}
                trajectories_by_domain = ExperimentsHopper._run_episode(
                    domain_agents=agents, num_steps=steps, optimize_meta=False)

                domain_samples = []
                for i in trajectories_by_domain.keys():
                    sample_returns = []
                    for j in range(len(trajectories_by_domain[i])):

                        sample_returns.append(sum(trajectories_by_domain[i][j]))

                    domain_samples.append(sample_returns)

                print("Episode %d / %d" % (ep, episodes))
                domain_rewards_by_episode[ep] = domain_samples

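                # Every 100 episodes: dump the collected rewards and save the first agent's model for each domain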
                if ep % 100 == 0:
                    filename = "meta_test_" + str(
                        ep) + ".pkl" if k == 0 else "no_meta_test_" + str(
                            ep) + ".pkl"
                    pickle.dump(domain_rewards_by_episode,
                                open(save_directory + "/" + filename, "wb"))
                    for ai, a in enumerate(agents):
                        type_ = "meta_" if k == 0 else "no_meta_"
                        type_ += str(ai) + "_"
                        a[0].learning_algorithm.save_model(
                            save_directory + "/" + type_, ep)