Code Example #1
    def experiment_meta_vs_random(meta_actor, meta_critic, save_directory,
                                  xml_models, episodes, steps):
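        """Evaluate agents guided by a pre-trained meta-policy (loaded from
        meta_actor/meta_critic) on the given InvertedPendulum XML variants,
        periodically pickling per-episode returns and saving agent models."""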

        alpha = 1e-3
        beta = 1e-2

        env = gym.make("RoboschoolInvertedPendulum-v1")
        env.reset()

        (obs, reward, done, info) = env.step(env.action_space.sample())

        num_features = obs.shape[0]
        num_actions = env.action_space.low.shape[0]

        if not os.path.isdir(save_directory):
            os.mkdir(save_directory)

        num_samples = 5

        # NOTE: range(1) means only the meta-policy case (k == 0) runs here;
        # the no-meta branch below is kept for symmetry but never executes.
        for k in range(1):
            if k == 0:
                meta = MetaPolicy(num_features=num_features,
                                  num_actions=num_actions,
                                  algo="PPO",
                                  alpha=1e-3,
                                  beta=1e-3,
                                  env=env)
                meta.learning_algorithm.load_model(meta_actor, meta_critic)
            else:
                meta = None

            agents = []
            for model in xml_models:
                domain_agents = []

                for _ in range(num_samples):
                    env = gym.make("RoboschoolInvertedPendulum-v1")
                    env.env.model_xml = model

                    agent = LinearAgent(env,
                                        meta_policy=meta,
                                        algo="PPO",
                                        alpha=alpha,
                                        beta=beta)
                    agent.learning_algorithm.t_length = 16
                    agent.learning_algorithm.update_steps = 64
                    agent.learning_algorithm.epochs = 8

                    domain_agents.append(agent)

                agents.append(domain_agents)

            domain_rewards_by_episode = {}
            for ep in range(episodes):
                trajectories_by_domain = ExperimentsInvertedPendulum._run_episode(
                    domain_agents=agents, num_steps=steps, optimize_meta=False)

                domain_samples = []
                for i in trajectories_by_domain.keys():
                    sample_returns = []
                    for j in range(len(trajectories_by_domain[i])):

                        sample_returns.append(sum(
                            trajectories_by_domain[i][j]))

                    domain_samples.append(sample_returns)

                print("Episode %d / %d" % (ep, episodes))
                domain_rewards_by_episode[ep] = domain_samples

                if ep % 100 == 0:
                    filename = "meta_test_" + str(
                        ep) + ".pkl" if k == 0 else "no_meta_test_" + str(
                            ep) + ".pkl"
                    pickle.dump(domain_rewards_by_episode,
                                open(save_directory + "/" + filename, "wb"))
                    for ai, a in enumerate(agents):
                        type_ = "meta_" if k == 0 else "no_meta_"
                        type_ += str(ai) + "_"
                        a[0].learning_algorithm.save_model(
                            save_directory + "/" + type_, ep)
Code Example #2
    def experiment_train_meta(save_directory, meta_alpha, meta_beta,
                              xml_models):
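        """Train the meta-policy on the InvertedPendulum XML variants: each
        trial spawns fresh PPO agents per domain, runs them for a fixed number
        of episodes, and periodically saves the meta model and reward logs."""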

        alpha = 1e-3
        beta = 1e-2

        env = gym.make("RoboschoolInvertedPendulum-v1")
        env.reset()

        (obs, reward, done, info) = env.step(env.action_space.sample())

        num_features = obs.shape[0]
        num_actions = env.action_space.low.shape[0]

        meta = MetaPolicy(num_features=num_features,
                          num_actions=num_actions,
                          algo="PPO",
                          alpha=meta_alpha,
                          beta=meta_beta,
                          env=env)
        meta.learning_algorithm.t_length = 32
        meta.learning_algorithm.update_steps = 64

        if not os.path.isdir(save_directory):
            os.mkdir(save_directory)

        meta_train_episodes = 30
        num_samples = 4
        for k in range(meta_train_episodes):
            agents = []

            print("Loading environments...")
            trial_by_domain = {}
            for i, model in enumerate(xml_models):

                domain_agents = []
                for _ in range(num_samples):
                    env = gym.make("RoboschoolInvertedPendulum-v1")
                    env.env.model_xml = model

                    agent = LinearAgent(env,
                                        meta_policy=meta,
                                        algo="PPO",
                                        alpha=alpha,
                                        beta=beta)

                    domain_agents.append(agent)

                    agent.learning_algorithm.t_length = 16
                    agent.learning_algorithm.update_steps = 64
                    agent.learning_algorithm.epochs = 8

                agents.append(domain_agents)

                trial_by_domain[i] = [list() for _ in range(num_samples)]
            print("Done loading...")

            episodes = 250
            steps = 500
            domain_rewards_by_episode = {}
            for ep in range(episodes):

                trajectories_by_domain = ExperimentsInvertedPendulum._run_episode(
                    domain_agents=agents, num_steps=steps, r_maxs=None)

                domain_samples = []
                for i in trajectories_by_domain.keys():
                    sample_returns = []
                    for j in range(len(trajectories_by_domain[i])):

                        sample_returns.append(sum(
                            trajectories_by_domain[i][j]))

                    domain_samples.append(sample_returns)

                print("Episode %d - Trial %d" % (ep, k))
                domain_rewards_by_episode[ep] = domain_samples

                if ep % 100 == 0:
                    val = (k * episodes) + ep
                    meta.learning_algorithm.save_model(save_directory + "/",
                                                       val)

                    # pickle.dump(meta, open(save_directory+"/meta_iter_"+str(k)+".pkl", "wb"))
                    pickle.dump(
                        domain_rewards_by_episode,
                        open(
                            save_directory + "/trajectory_iter_" + str(val) +
                            ".pkl", "wb"))
Code Example #3
    def experiment_meta_vs_random(meta_actor, meta_critic, save_directory,
                                  setups, episodes, steps):
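        """Compare meta-policy-guided agents against agents without a
        meta-policy on the given Animat maze setups, logging returns, counts
        of "null" (no-effect) actions, and periodic model checkpoints."""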

        alpha = 1e-4
        beta = 1e-3
        basis_order = 3

        env = AnimatEnv("./CustomEnvironments/maze1.txt")
        env.reset()

        (obs, reward, done, info) = env.step(env.action_space.sample())
        obs = EnvWrapper.normalize_range(obs, env.env_range)
        phi = fourier_basis(obs, order=basis_order)

        num_features = phi.shape[0]
        num_actions = env.action_space.n
        env = EnvWrapper(env, basis_order=basis_order, normalization=1)

        if not os.path.isdir(save_directory):
            os.mkdir(save_directory)

        num_samples = 3

        for k in range(2):
            # NOTE: overriding k forces the no-meta (random) branch on every
            # iteration, so the meta-policy case below never runs as written.
            k = 1
            if k == 0:
                meta = MetaPolicy(num_features=num_features,
                                  num_actions=num_actions,
                                  algo="PPO",
                                  alpha=1e-3,
                                  beta=1e-3,
                                  env=env)
                meta.learning_algorithm.load_model(meta_actor, meta_critic)
            else:
                meta = None

            agents = []
            for setup in setups:
                domain_agents = []

                for _ in range(num_samples):
                    gym_env = AnimatEnv(setup)

                    env = EnvWrapper(gym_env,
                                     basis_order=basis_order,
                                     normalization=1)
                    agent = LinearAgent(env,
                                        meta_policy=meta,
                                        algo="PPO",
                                        alpha=alpha,
                                        beta=beta)
                    agent.learning_algorithm.t_length = 8
                    agent.learning_algorithm.update_steps = 64
                    agent.learning_algorithm.epochs = 4
                    agent.learning_algorithm.batch_size = 16
                    domain_agents.append(agent)

                agents.append(domain_agents)

            domain_rewards_by_episode = {}
            null_action_by_episode = {}
            for ep in range(episodes):
                null_actions = {}
                trajectories_by_domain = ExperimentsAnimat._run_episode(
                    domain_agents=agents, num_steps=steps)

                domain_samples = []
                for i in trajectories_by_domain.keys():
                    sample_rewards = []
                    for j in range(len(trajectories_by_domain[i])):
                        t_rewards = []
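                        # Count actions whose effect on the animat's position is
                        # negligible (|dx| and |dy| both below 0.1) as "null" actions.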
                        for t in trajectories_by_domain[i][j]:
                            action, explore = t['action']
                            a = agents[i][j].env.env.action_space.actions[action]
                            effect = agents[i][j].env.env.animat._action_effect(a)
                            if math.fabs(effect[0]) < 0.1 and math.fabs(effect[1]) < 0.1:
                                null_actions[action] = null_actions.get(action, 0) + 1

                            t_rewards.append(t['reward'])

                        sample_rewards.append(sum(t_rewards))
                    domain_samples.append(sample_rewards)

                print("Episode %d" % (ep))
                domain_rewards_by_episode[ep] = domain_samples
                null_action_by_episode[ep] = null_actions

                if ep % 10 == 0:
                    filename = "meta_test_" + str(
                        ep) + ".pkl" if k == 0 else "no_meta_test_" + str(
                            ep) + ".pkl"
                    filename2 = "null_actions_meta_" + str(
                        ep
                    ) + ".pkl" if k == 0 else "null_actions_no_meta_" + str(
                        ep) + ".pkl"
                    pickle.dump(domain_rewards_by_episode,
                                open(save_directory + "/" + filename, "wb"))
                    pickle.dump(null_action_by_episode,
                                open(save_directory + "/" + filename2, "wb"))

                    for ai, a in enumerate(agents):
                        type_ = "meta_" if k == 0 else "no_meta_"
                        type_ += str(ai) + "_"
                        a[0].learning_algorithm.save_model(
                            save_directory + "/" + type_, ep)
Code Example #4
    def experiment_train_meta(save_directory, meta_alpha, meta_beta):
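        """Train the meta-policy on the recorded Animat maze setups: fresh PPO
        agents are created per setup and sample each trial, rewards are
        normalized by each domain's max reward, and checkpoints are saved."""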
        alpha = 1e-4
        beta = 1e-3

        gym_env = AnimatEnv("./CustomEnvironments/maze1.txt")
        gym_env.reset()
        basis_order = 0  # ExperimentsAnimat.RECORDED_DATA[0]['order']

        (obs, reward, done, info) = gym_env.step(gym_env.action_space.sample())
        # obs = EnvWrapper.normalize_range(obs, gym_env.env_range)
        # phi = fourier_basis(obs, order=basis_order)

        num_features = obs.shape[0]
        num_actions = gym_env.action_space.n

        env = EnvWrapper(gym_env, basis_order=basis_order, normalization=1)

        meta = MetaPolicy(num_features=num_features,
                          num_actions=num_actions,
                          algo="PPO",
                          alpha=meta_alpha,
                          beta=meta_beta,
                          env=env)
        meta.learning_algorithm.t_length = 32
        meta.learning_algorithm.update_steps = 256

        if not os.path.isdir(save_directory):
            os.mkdir(save_directory)

        meta_train_episodes = 30
        num_samples = 3
        for k in range(meta_train_episodes):
            agents = []

            print("Loading environments...")
            r_maxs = []
            trial_by_domain = {}
            for i, d in enumerate(ExperimentsAnimat.RECORDED_DATA):
                print("Setup: " + str(d['setup']))
                setup = d['setup']
                max_r = d['max_r']
                episodes = d['episodes']
                # steps = 500 #d['max_steps']
                # episodes = 1000
                steps = 600
                basis_order = d['order']

                domain_agents = []
                for _ in range(num_samples):
                    gym_env = AnimatEnv(setup)
                    env = EnvWrapper(gym_env,
                                     basis_order=basis_order,
                                     normalization=1)
                    agent = LinearAgent(env,
                                        meta_policy=meta,
                                        algo="PPO",
                                        alpha=alpha,
                                        beta=beta)
                    domain_agents.append(agent)

                    agent.learning_algorithm.t_length = 32
                    agent.learning_algorithm.update_steps = 128
                    agent.learning_algorithm.epochs = 4
                    agent.learning_algorithm.batch_size = 16

                agents.append(domain_agents)
                r_maxs.append(max_r)

                trial_by_domain[i] = [list() for _ in range(num_samples)]
            print("Done loading...")

            domain_rewards_by_episode = {}
            for ep in range(episodes):

                trajectories_by_domain = ExperimentsAnimat._run_episode(
                    domain_agents=agents, num_steps=steps, r_maxs=r_maxs)

                domain_samples = []
                for i in trajectories_by_domain.keys():
                    sample_rewards = []
                    for j in range(len(trajectories_by_domain[i])):
                        t_rewards = []
                        for t in trajectories_by_domain[i][j]:
                            t_rewards.append(t['reward'])
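                            # Normalize the reward by this domain's recorded
                            # maximum before storing the transition for the
                            # meta-policy update.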
                            t['reward'] = t['reward'] / r_maxs[i]
                            trial_by_domain[i][j].append(t)

                        sample_rewards.append(sum(t_rewards))
                    domain_samples.append(sample_rewards)

                print("Episode %d - Trial %d" % (ep, k))
                domain_rewards_by_episode[ep] = domain_samples

                if ep % 10 == 0:
                    val = (k * episodes) + ep
                    meta.learning_algorithm.save_model(save_directory + "/",
                                                       val)
                    # pickle.dump(meta, open(save_directory+"/meta_iter_"+str(k)+".pkl", "wb"))
                    pickle.dump(
                        domain_rewards_by_episode,
                        open(
                            save_directory + "/trajectory_iter_" + str(val) +
                            ".pkl", "wb"))

            trajectories = []
            for key in trial_by_domain.keys():
                for traj in trial_by_domain[key]:
                    trajectories.append(traj)

            # The meta-policy here is created with algo="PPO", so this Monte
            # Carlo update only fires when a REINFORCE meta-policy is used.
            if meta.algo == "REINFORCE":
                print("Updating meta....")
                meta.montecarlo_update(trajectories)
Code Example #5
    def experiment_meta_vs_random(meta_actor, meta_critic, save_directory, setups, episodes, steps):    
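        """Compare meta-policy-guided agents (k == 0) against agents without a
        meta-policy (k == 1) on a set of CartPole parameterizations, pickling
        the per-episode rewards of every domain and sample."""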

        alpha = 0.001
        basis_order = 3

        env = gym.make('CartPole-v0')
        env.reset()

        (obs, reward, done, info) = env.step(env.action_space.sample())
        obs = EnvWrapper.modified_sigmoid(obs)
        phi = fourier_basis(obs, order=basis_order)

        num_features = phi.shape[0]
        num_actions = env.action_space.n

        if not os.path.isdir(save_directory):
            os.mkdir(save_directory)

        num_samples = 5

        for k in range(2):
            if k == 0:
                meta = MetaPolicy(num_features=num_features, num_actions=num_actions, algo="PPO", alpha=1e-3, beta=1e-3, env=env)
                meta.learning_algorithm.load_model(meta_actor, meta_critic)
            else:
                meta = None

            agents = []
            for setup in setups:
                domain_agents = []
                for _ in range(num_samples):
                    gym_env = gym.make('CartPole-v0')
                    gym_env.env.force_mag = setup["force"]
                    gym_env.env.length = setup["pole_length"]
                    gym_env.env.masscart = setup["masscart"]
                    gym_env.env.masspole = setup["masspole"]

                    env = EnvWrapper(gym_env, basis_order=basis_order, normalization=0)
                    agent = LinearAgent(env, meta_policy=meta, alpha=alpha, algo="PPO")

                    if meta is None:
                        agent.random_action_prob = 0.0

                    domain_agents.append( agent )

                agents.append( domain_agents )

            domain_rewards_by_episode = {}
            for ep in range(episodes):
                
                trajectories_by_domain = ExperimentsCartpole._run_episode(domain_agents=agents, num_steps=steps, optimize_meta=False)
                
                domain_samples = []
                for i in trajectories_by_domain.keys():
                    sample_rewards = []
                    for j in range(len(trajectories_by_domain[i])):
                        t_rewards = []
                        for t in trajectories_by_domain[i][j]:
                            t_rewards.append( t['reward'] )
                    
                        sample_rewards.append( t_rewards )
                    domain_samples.append( sample_rewards )

                print("Episode %d" %(ep))
                domain_rewards_by_episode[ep] = domain_samples

                if ep % 100 == 0:
                    filename = "meta_test"+str(ep)+".pkl" if k == 0 else "no_meta_test_"+str(ep)+".pkl"
                    pickle.dump(domain_rewards_by_episode, open(save_directory+"/"+filename, "wb"))
Code Example #6
    def experiment_train_meta(save_directory, meta_alpha, meta_beta):    
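        """Train the meta-policy on the recorded CartPole setups: each trial
        builds fresh PPO agents per setup, collects normalized-reward
        trajectories, and saves the meta model and reward logs per trial."""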

        gym_env = gym.make('CartPole-v0')
        gym_env.reset()
        basis_order = ExperimentsCartpole.RECORDED_DATA[0]['order']

        (obs, reward, done, info) = gym_env.step(gym_env.env.action_space.sample())
        obs = EnvWrapper.modified_sigmoid(obs)
        phi = fourier_basis(obs, order=basis_order)

        num_features = phi.shape[0]
        num_actions = gym_env.env.action_space.n

        env = EnvWrapper(gym_env, basis_order=basis_order, normalization=0)

        meta = MetaPolicy(num_features=num_features, num_actions=num_actions, algo="PPO", alpha=meta_alpha, beta=meta_beta, env=env)

        if not os.path.isdir(save_directory):
            os.mkdir(save_directory)

        meta_train_episodes = 500
        num_samples = 3
        for k in range(meta_train_episodes):
            agents = []

            print("Loading environments...")
            r_maxs = []
            trial_by_domain = {}
            for i, d in enumerate(ExperimentsCartpole.RECORDED_DATA):
                print("Setup: " + str(d['setup']))
                setup = d['setup']
                max_r = d['max_r']
                episodes = d['episodes']
                steps = d['max_steps']
                basis_order = d['order']
                # alpha = d['alpha']
                alpha = 0.0001
                
                domain_agents = []
                for _ in range(num_samples):
                    gym_env = gym.make('CartPole-v0')
                    gym_env.env.force_mag = setup["force"]
                    gym_env.env.length = setup["pole_length"]
                    gym_env.env.masscart = setup["masscart"]
                    gym_env.env.masspole = setup["masspole"]

                    env = EnvWrapper(gym_env, basis_order=basis_order, normalization=0)
                    agent = LinearAgent(env, meta_policy=meta, alpha=alpha, algo="PPO")

                    domain_agents.append( agent )

                agents.append( domain_agents )
                r_maxs.append( max_r )

                trial_by_domain[i] = [ list() for _ in range(num_samples) ]
            print("Done loading...")

            
            domain_rewards_by_episode = {}
            for ep in range(episodes):
                
                trajectories_by_domain = ExperimentsCartpole._run_episode(domain_agents=agents, num_steps=steps, r_maxs=r_maxs)
                
                domain_samples = []
                for i in trajectories_by_domain.keys():
                    sample_rewards = []
                    for j in range(len(trajectories_by_domain[i])):
                        t_rewards = []
                        for t in trajectories_by_domain[i][j]:
                            t_rewards.append( t['reward'] )
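                            # Normalize the reward by this domain's recorded
                            # maximum before storing the transition for the
                            # meta-policy update.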
                            t['reward'] = t['reward'] / r_maxs[i] 
                            trial_by_domain[i][j].append( t )
                    
                        sample_rewards.append( t_rewards )
                    domain_samples.append( sample_rewards )

                print("Episode %d - Trial %d" %(ep, k))
                domain_rewards_by_episode[ep] = domain_samples

            trajectories = []
            for key in trial_by_domain.keys():
                for traj in trial_by_domain[key]:
                    trajectories.append( traj )

            # As above, the meta-policy uses PPO, so this REINFORCE-style
            # Monte Carlo update is skipped unless algo is changed.
            if meta.algo == "REINFORCE":
                print("Updating meta....")
                meta.montecarlo_update(trajectories)

            meta.learning_algorithm.save_model(save_directory+"/", k)
            # pickle.dump(meta, open(save_directory+"/meta_iter_"+str(k)+".pkl", "wb"))
            pickle.dump(domain_rewards_by_episode, open(save_directory+"/trajectory_iter_"+str(k)+".pkl", "wb"))