Example #1
import rospy
import numpy as np

from algorithm.ppo_gae import PPOGAEAgent

# Hyperparameters are read from the ROS parameter server (the /ML namespace).
epochs = rospy.get_param("/ML/epochs")
hdim = rospy.get_param("/ML/hdim")
policy_lr = rospy.get_param("/ML/policy_lr")
value_lr = rospy.get_param("/ML/value_lr")
max_std = rospy.get_param("/ML/max_std")
clip_range = rospy.get_param("/ML/clip_range")
n_step = rospy.get_param("/ML/n_step")

gamma = rospy.get_param("/ML/gamma")
lam = rospy.get_param("/ML/lam")
episode_size = rospy.get_param("/ML/episode_size")
batch_size = rospy.get_param("/ML/batch_size")
nupdates = rospy.get_param("/ML/nupdates")
maxlen_num = rospy.get_param("/ML/maxlen_num")
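# The /ML/* values above must already exist on the ROS parameter server when
# this node starts. A minimal sketch of setting them from Python (parameter
# names are taken from above; the values are illustrative assumptions only):
#
#   rospy.set_param("/ML/epochs", 10)
#   rospy.set_param("/ML/gamma", 0.99)
#   rospy.set_param("/ML/lam", 0.95)
#
# In a typical launch setup they would instead be loaded from a YAML file via
# a <rosparam> tag.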

# obs_dim, n_act and seed are assumed to be defined earlier in this script.
agent = PPOGAEAgent(obs_dim, n_act, epochs, hdim, policy_lr, value_lr, max_std, clip_range, seed)
#agent = PPOGAEAgent(obs_dim, n_act, epochs=10, hdim=obs_dim, policy_lr=3e-3, value_lr=1e-3, max_std=1.0, clip_range=0.2, seed=seed)

'''
PPO Agent with Gaussian policy
'''
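# "Gaussian policy" means each action is sampled from a Normal distribution:
# the policy network outputs the mean, and a learned standard deviation is
# kept below max_std. A NumPy-only illustration of that sampling step (the
# mean/std values here are stand-ins, not the agent's actual outputs):
#
#   mean = np.zeros(n_act)                          # stand-in for the network output
#   std = np.minimum(np.full(n_act, 0.5), max_std)  # learned std, capped at max_std
#   action = np.random.normal(mean, std)            # one action, shape (n_act,)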
def run_episode(env, animate=False): # Run policy and collect (state, action, reward) pairs
    obs = env.reset()
    observes, actions, rewards, infos = [], [], [], []
    done = False

    for update in range(n_step):
        print("update", update)
        obs = np.array(obs)
        obs = obs.astype(np.float32).reshape((1, -1)) # numpy.ndarray (1, num_obs)
        observes.append(obs)
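# The loop accumulates one row per step; a common follow-up, shown here only
# as a hedged sketch rather than the project's actual code, is to stack the
# collected lists into arrays for the PPO/GAE update:
#
#   observes = np.concatenate(observes)            # shape (n_step, obs_dim)
#   rewards = np.array(rewards, dtype=np.float32)  # shape (n_step,)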

Example #2

# import our training environment
import gym
from env.ur_door_opening_env import URSimDoorOpening

# import our training algorithms
from algorithm.ppo_gae import PPOGAEAgent

seed = 0
obs_dim = 21  # env.observation_space.shape[0]; update hdim if this changes
n_act = 6     # env.action_space.n (act_dim in the config)
agent = PPOGAEAgent(obs_dim,
                    n_act,
                    epochs=10,
                    hdim=64,
                    policy_lr=1e-4,
                    value_lr=1e-4,
                    max_std=1.0,
                    clip_range=0.2,
                    seed=seed)
#agent = PPOGAEAgent(obs_dim, n_act, epochs=10, hdim=obs_dim, policy_lr=3e-3, value_lr=1e-3, max_std=1.0, clip_range=0.2, seed=seed)
'''
PPO Agent with Gaussian policy
'''


def run_episode(env, animate=False):  # Run policy and collect (state, action, reward) pairs
    obs = env.reset()
    observes, actions, rewards, infos = [], [], [], []