Example #1
def train_ddpg():
    env = gimbal(5, 500)
    env = DummyVecEnv([lambda: env])
    eval_env = gimbal(5, 500)
    eval_env = DummyVecEnv([lambda: eval_env])

    # the noise objects for DDPG
    n_actions = env.action_space.shape[-1]
    param_noise = None
    action_noise = None

    model = DDPG(policy=MlpPolicy,
                 env=env,
                 gamma=0.99,
                 memory_policy=None,
                 eval_env=eval_env,
                 nb_train_steps=500,
                 nb_rollout_steps=500,
                 nb_eval_steps=500,
                 param_noise=param_noise,
                 action_noise=action_noise,
                 normalize_observations=False,
                 tau=0.001,
                 batch_size=128,
                 param_noise_adaption_interval=50,
                 normalize_returns=False,
                 enable_popart=False,
                 observation_range=(-5000.0, 5000.0),
                 critic_l2_reg=0.0,
                 return_range=(-np.inf, np.inf),
                 actor_lr=0.0001,
                 critic_lr=0.001,
                 clip_norm=None,
                 reward_scale=1.0,
                 render=False,
                 render_eval=False,
                 memory_limit=50000,
                 verbose=1,
                 tensorboard_log="./logs",
                 _init_setup_model=True,
                 policy_kwargs=None,
                 full_tensorboard_log=False)
    #model = DDPG.load("./models/baseline_ddpg_t2")
    #model.set_env(env)
    model.learn(total_timesteps=1000000,
                callback=None,
                seed=None,
                log_interval=100,
                tb_log_name='DDPG',
                reset_num_timesteps=True)
    model.save("./models/baseline_ddpg_t2")
Example #2
def train_ppo2_mlp():
    n_cpu = 4
    env = SubprocVecEnv([lambda: gimbal(5, 500) for i in range(n_cpu)])
    model = PPO2(policy=MlpPolicy,
                 env=env,
                 gamma=0.99,
                 n_steps=100,
                 ent_coef=0.01,
                 learning_rate=0.00025,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 lam=0.95,
                 nminibatches=4,
                 noptepochs=4,
                 cliprange=0.2,
                 verbose=1,
                 tensorboard_log="./logs",
                 _init_setup_model=True,
                 policy_kwargs=None,
                 full_tensorboard_log=False)
    model.learn(total_timesteps=1000000,
                callback=None,
                seed=None,
                log_interval=1,
                tb_log_name='PPO2',
                reset_num_timesteps=True)
    model.save("./models/baseline_ppo2_t11_camshifted")
Example #3
def objective(trial):
    # Hyper-parameters to adjust
    policy = trial.suggest_categorical('policy', ['MlpPolicy', 'MlpLnPolicy', 'MlpLstmPolicy', 'MlpLnLstmPolicy'])
    gamma = trial.suggest_uniform('gamma', 0.10, 1.0)
    ent_coef = trial.suggest_uniform('ent_coef', 0.01, 0.10)
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)
    vf_coef = trial.suggest_uniform('vf_coef', 0.10, 1.0)
    lam = trial.suggest_uniform('lam', 0.01, 0.95)

    if policy == 'MlpPolicy':
        policy = MlpPolicy
    elif policy == 'MlpLnPolicy':
        policy = MlpLnPolicy
    elif policy == 'MlpLstmPolicy':
        policy = MlpLstmPolicy
    elif policy == 'MlpLnLstmPolicy':
        policy = MlpLnLstmPolicy

    # Train with those hyper-parameters
    n_cpu = 4
    env = SubprocVecEnv([lambda: gimbal(5, 500) for i in range(n_cpu)])
    model = PPO2(policy=policy, env=env, gamma=gamma, n_steps=100, ent_coef=ent_coef, learning_rate=learning_rate, 
                vf_coef=vf_coef, max_grad_norm=0.5, lam=lam, nminibatches=4, noptepochs=4, cliprange=0.2, 
                verbose=0, tensorboard_log=None, _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False)
    model.learn(total_timesteps=250000, callback=None, seed=None, log_interval=1, tb_log_name='PPO2', reset_num_timesteps=True)

    # Evaluate the trained model over a fixed number of episodes
    env = gimbal(5, 500)
    MAX_episodes = 25
    reward_avg = 0
    for episodes in range(MAX_episodes):
        obs = env.reset()
        r = 0
        while True:
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            r += rewards
            #env.render()
            if dones:
                reward_avg += r
                break
    return -(reward_avg / MAX_episodes)  # negate so that minimizing the objective maximizes average reward
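The trial API used in objective() matches Optuna's. Assuming that is the tuner in use, a minimal driver for this objective might look like the following; the study settings and trial count are illustrative, not from the original code.

import optuna

# The objective returns the negated average reward, so the study minimizes it.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

print("Best hyper-parameters:", study.best_params)
print("Best objective value:", study.best_value)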
Example #4
def run(self):
    gm = gimbal(5, 500)
    obs = gm.reset()
    action = [5, -5]
    for _ in range(50000):
        gm.render()
        print(gm.observation_space.shape)
        #action = np.random.uniform(low=-6.13, high=6.13, size=2)
        obs, rwd, done, info = gm.step(action)
        if done:
            obs = gm.reset()
    gm.close()
Example #5
def train_ppo2_mlplstm():
    env = DummyVecEnv([lambda: gimbal(5, 500)])
    #model = PPO2(policy=MlpLstmPolicy, env=env, gamma=0.99, n_steps=500, ent_coef=0.01, learning_rate=0.00025,
    #            vf_coef=0.5, max_grad_norm=0.5, lam=0.95, nminibatches=1, noptepochs=4, cliprange=0.2,
    #            verbose=1, tensorboard_log="./logs", _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False)
    model = PPO2.load("./models/baseline_ppo2_t6_dynamicR", env=env)
    model.learn(total_timesteps=500000,
                callback=None,
                seed=None,
                log_interval=1,
                tb_log_name='PPO2',
                reset_num_timesteps=True)
    model.save("./models/baseline_ppo2_t6_dynamicR")
Example #6
def train_her():
    env = gimbal(5, 500)
    n_sampled_goal = 4
    model = HER('MlpPolicy',
                env,
                SAC,
                n_sampled_goal=n_sampled_goal,
                goal_selection_strategy='future',
                verbose=1,
                buffer_size=int(1e6),
                learning_rate=1e-3,
                gamma=0.95,
                batch_size=256,
                policy_kwargs=dict(layers=[256, 256, 256]))
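train_her() above only constructs the HER wrapper and never trains it. Assuming gimbal exposes the goal-based (dict) observation space that HER requires, the missing training and saving steps would follow the same pattern as the other examples; the timestep budget and save path below are illustrative, not from the original code.

model.learn(total_timesteps=1000000)
model.save("./models/baseline_her_sac_t1")  # illustrative path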
Example #7
def view_ppo2_mlplstm():
    env = DummyVecEnv([lambda: gimbal(5, 500)])
    model = PPO2.load("./models/baseline_ppo2_t6_dynamicR")
    success_rate = 0
    reward_avg = 0
    for episodes in range(50):
        obs = env.reset()
        state = None
        done = [False]
        r = 0
        while True:
            action, state = model.predict(obs, state=state, mask=done)
            obs, rewards, done, _ = env.step(action)
            r += rewards
            env.render()
            if done:
                if r > -100:
                    success_rate += 1
                    reward_avg += r
                break
    print("Success rate: ", success_rate, "Avg rewards: ",
          (reward_avg / success_rate))
Example #8
def view_ppo2_mlp():
    env = DummyVecEnv([lambda: gimbal(5, 500)])
    model = PPO2.load("./models/baseline_ppo2_t7_prune1", env=env)

    success_rate = 0
    reward_avg = 0
    for episodes in range(50):
        obs = env.reset()
        r = 0
        while True:
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            r += rewards
            #env.render()
            #env.target_ctrl()
            if dones:
                if r > -100:
                    success_rate += 1
                    reward_avg += r
                break
    print("Success rate: ", success_rate, "Avg rewards: ",
          (reward_avg / success_rate))
Example #9
def train_gail_withppo2():
    env = gimbal(5, 500)
    env = DummyVecEnv([lambda: env])
    model = PPO2.load("./models/baseline_ppo2_t1")
    generate_expert_traj(model,
                         './models/baseline_expert_t1',
                         env,
                         n_timesteps=0,
                         n_episodes=100)
    dataset = ExpertDataset(expert_path='./models/baseline_expert_t1.npz',
                            traj_limitation=-1,
                            verbose=1)
    model = GAIL("MlpPolicy", env, dataset, verbose=1)
    model.learn(total_timesteps=500000)
    model.save("./models/baseline_gail_ppo2_t1")
Example #10
def view_ddpg():
    env = gimbal(5, 500)
    model = DDPG.load("./models/baseline_ddpg_t2")
    success_rate = 0
    reward_avg = 0
    for episodes in range(50):
        obs = env.reset()
        r = 0
        while True:
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            r += rewards
            env.render()
            if dones:
                if r > -100:
                    success_rate += 1
                    reward_avg += r
                break
    print("Success rate: ", success_rate, "Avg rewards: ",
          reward_avg / success_rate)
Example #11
def view(n_episodes, frame_skip, episode_length, success_threshold, model,
         actions, features, high, low, actor_lr, critic_lr, rewardDecay,
         priorityA):
    test_rewards = []
    dones_anyreward = 0
    dones_goodreward = 0
    env = gimbal(frame_skip, episode_length)
    agent = DDPGAgent(env,
                      n_actions=actions,
                      n_features=features,
                      featurize=False,
                      action_high=high,
                      action_low=low,
                      actor_learning_rate=actor_lr,
                      critic_learning_rate=critic_lr,
                      reward_decay=rewardDecay,
                      priority_alpha=priorityA)
    agent.saver.restore(agent.sess, model + "_BEST.ckpt")
    for i_episode in range(n_episodes):
        state = agent.env.reset()
        r = 0
        while True:
            agent.env.render()
            action = agent.choose_action([state], 0, agent.action_low,
                                         agent.action_high)
            next_state, reward, done, info = agent.env.step(action)
            r += reward
            state = next_state
            if done:
                if agent.env.timestep < agent.env.MAX_timestep:
                    dones_anyreward += 1
                    if r > success_threshold:
                        dones_goodreward += 1
                print("episode:", i_episode + 1, "rewards: %.2f" % r, end="\r")
                test_rewards += [r]
                break
    print("\n")
    print("finished testing! Average reward: ",
          np.sum(test_rewards) / n_episodes, "Dones (any reward): ",
          dones_anyreward, "Dones (good reward)", dones_goodreward)
Example #12
def conduct_test(agent, n_episodes, frame_skip, episode_length,
                 success_threshold, success_rate):
    env = gimbal(frame_skip, episode_length)
    successes = 0
    rewards_sum = 0
    for _ in range(n_episodes):
        state = env.reset()
        r = 0
        while True:
            action = agent.choose_action([state], 0, agent.action_low,
                                         agent.action_high)
            next_state, reward, done, info = env.step(action)
            r += reward
            state = next_state
            if done:
                if r >= success_threshold:
                    successes += 1
                    rewards_sum += r
                break
    env.close()
    if successes >= success_rate:
        return True, rewards_sum / successes
    else:
        return False, 0
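conduct_test() reads as a checkpointing gate that would be called periodically while training the custom DDPGAgent. A hedged sketch of such a call is below; the agent variable, thresholds, and model_path checkpoint prefix are assumptions, not part of the original code.

# Hypothetical periodic evaluation during training of the custom DDPGAgent.
passed, avg_reward = conduct_test(agent,
                                  n_episodes=25,
                                  frame_skip=5,
                                  episode_length=500,
                                  success_threshold=-100,
                                  success_rate=20)
if passed:
    # agent.saver is the tf.train.Saver restored in Example #11; model_path is an assumed prefix.
    agent.saver.save(agent.sess, model_path + "_BEST.ckpt")
    print("Checkpoint saved, average reward over successful episodes: %.2f" % avg_reward)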