Example #1
def train_policy(num_of_envs, log_relative_path, maximum_episode_length,
                 skip_frame, seed_num, sac_config, total_time_steps,
                 validate_every_timesteps, task_name):
    def _make_env(rank):
        def _init():
            task = generate_task(task_generator_id=task_name)
            env = CausalWorld(task=task,
                              skip_frame=skip_frame,
                              enable_visualization=False,
                              seed=seed_num + rank,
                              max_episode_length=maximum_episode_length)
            env = HERGoalEnvWrapper(env)
            return env

        set_global_seeds(seed_num)
        return _init

    os.makedirs(log_relative_path)
    env = SubprocVecEnv([_make_env(rank=i) for i in range(num_of_envs)])
    model = HER('MlpPolicy',
                env,
                SAC,
                verbose=1,
                policy_kwargs=dict(layers=[256, 256, 256]),
                **sac_config)
    save_config_file(sac_config,
                     _make_env(0)(),
                     os.path.join(log_relative_path, 'config.json'))
    for i in range(int(total_time_steps / validate_every_timesteps)):
        model.learn(total_timesteps=validate_every_timesteps,
                    tb_log_name="sac",
                    reset_num_timesteps=False)
    model.save(os.path.join(log_relative_path, 'saved_model'))
    return
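A minimal sketch of how train_policy above might be invoked; every value below (task id, paths, SAC settings) is a placeholder rather than a setting taken from the original project.

if __name__ == '__main__':
    # Hypothetical SAC hyperparameters forwarded to HER/SAC via **sac_config.
    sac_config = {'gamma': 0.95,
                  'learning_rate': 3e-4,
                  'buffer_size': 1000000,
                  'batch_size': 256}
    train_policy(num_of_envs=4,
                 log_relative_path='logs/pushing_sac',
                 maximum_episode_length=600,
                 skip_frame=3,
                 seed_num=0,
                 sac_config=sac_config,
                 total_time_steps=1000000,
                 validate_every_timesteps=500000,
                 task_name='pushing')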
Example #2
def main(argv):

    numControlledJoints = 6
    fixed = False
    normalize_observations = False
    gamma = 0.9
    batch_size = 16
    memory_limit = 1000000
    normalize_returns = True
    timesteps = 1000000
    policy_name = "reaching_policy"
    discreteAction = 0
    rend = False

    kukaenv = kukaReachGymEnvHer(urdfRoot=robot_data.getDataPath(),
                                 renders=rend,
                                 useIK=0,
                                 isDiscrete=discreteAction,
                                 numControlledJoints=numControlledJoints,
                                 fixedPositionObj=fixed,
                                 includeVelObs=True)
    kukaenv = Monitor(kukaenv, log_dir, allow_early_resets=True)

    n_actions = kukaenv.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) *
                                                np.ones(n_actions))
    model_class = DDPG
    goal_selection_strategy = 'future'
    model = HER(CustomPolicy,
                kukaenv,
                model_class,
                n_sampled_goal=4,
                goal_selection_strategy=goal_selection_strategy,
                verbose=1,
                tensorboard_log=
                "../pybullet_logs/kuka_reach_ddpg/reaching_DDPG_HER_PHASE",
                buffer_size=1000000,
                batch_size=64,
                random_exploration=0.3,
                action_noise=action_noise)

    print(colored("-----Timesteps:", "red"))
    print(colored(timesteps, "red"))
    print(colored("-----Number Joints Controlled:", "red"))
    print(colored(numControlledJoints, "red"))
    print(colored("-----Object Position Fixed:", "red"))
    print(colored(fixed, "red"))
    print(colored("-----Policy Name:", "red"))
    print(colored(policy_name, "red"))
    print(colored("------", "red"))
    print(colored("Launch the script with -h for further info", "red"))

    model.learn(total_timesteps=timesteps, log_interval=100, callback=callback)

    print("Saving model to kuka.pkl")
    model.save("../pybullet_logs/kukareach_ddpg_her/" + policy_name)

    del model  # remove to demonstrate saving and loading
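Picking up the saving-and-loading hint in the final comment, a minimal sketch of how the policy saved above could be reloaded and rolled out; the helper name is hypothetical and the path simply mirrors the save call.

def evaluate_saved_policy(env, policy_name, n_steps=1000):
    # Reload the HER+DDPG policy saved by main(); HER needs the env (or a
    # HERGoalEnvWrapper) passed back in before predict() can be used.
    model = HER.load("../pybullet_logs/kukareach_ddpg_her/" + policy_name, env=env)
    obs = env.reset()
    for _ in range(n_steps):
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        if done:
            obs = env.reset()
    return model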
Example #3
def main(load_policy=True):
    global log_dir
    model_class = TD3  # works also with SAC and DDPG
    action_space = 6
    gamma = 0.9
    memory_limit = 1000000
    timesteps = 15000000
    discreteAction = 0
    rend = False
    # learning rate


    env = bioEnv()
  
    env = Monitor(env, log_dir, allow_early_resets=True)

    goal_selection_strategy = 'future'
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))
    # Wrap the model

    model = HER(CustomTD3Policy,
                env,
                model_class,
                n_sampled_goal=4,
                goal_selection_strategy=goal_selection_strategy,
                verbose=1,
                tensorboard_log="../pybullet_logs/bioEnv_TD3",
                buffer_size=1000000,
                batch_size=256,
                random_exploration=0.3,
                action_noise=action_noise)

    if (load_policy):
        model = HER.load("models/TD3/curriculum/best_model_part_11_10g_TRUE.pkl",
                         env=env,
                         n_sampled_goal=4,
                         goal_selection_strategy=goal_selection_strategy,
                         tensorboard_log="../pybullet_logs/bioEnv_TD3",
                         buffer_size=1000000,
                         batch_size=256,
                         random_exploration=0.3,
                         action_noise=action_noise)

    model.learn(timesteps, log_interval=100, callback=callback)
   
    model.save("policy_TD3_Discr")
Example #4
def main(
  training_env: PSMCartesianHERDDPGEnv,
  eval_env: PSMCartesianHERDDPGEnv = None,
  log_dir='./.logs/results'
):

  os.makedirs(log_dir, exist_ok=True)

  # training_env = Monitor(training_env, log_dir)

  n_actions = training_env.action_space.shape[0]
  noise_std = 0.2
  # Currently using OU noise
  action_noise = OrnsteinUhlenbeckActionNoise(
    mean=np.zeros(n_actions),
    sigma=noise_std * np.ones(n_actions)
  )
  model_class = DDPG  # works also with SAC, DDPG and TD3

  rl_model_kwargs = {
    'actor_lr': 1e-3,
    'critic_lr': 1e-3,
    'action_noise': action_noise,
    'nb_train_steps': 300,
    'nb_rollout_steps': 100,
    'gamma': 0.95,
    'observation_range': (-1.5, 1.5),
    'random_exploration': 0.05,
    'normalize_observations': True,
    'critic_l2_reg': 0.01
  }

  # Available strategies (cf paper): future, final, episode, random
  model = HER(
    'MlpPolicy',
    training_env,
    model_class,
    verbose=1,
    n_sampled_goal=4,
    goal_selection_strategy='future',
    buffer_size=int(1e5),
    batch_size=128,
    tensorboard_log="./ddpg_dvrk_tensorboard/",
    **rl_model_kwargs
  )
  # Reset the model
  training_env.reset()
  # Create callbacks
  checkpoint_callback = CheckpointCallback(
    save_freq=100000,
    save_path="./ddpg_dvrk_tensorboard/"
  )  # save_path="./.model/model_checkpoint/") #save_freq=100000
  # eval_callback = EvalCallback(training_env, best_model_save_path='./ddpg_dvrk_tensorboard/best_model',
  #                             log_path=log_dir, eval_freq=500)
  callback = CallbackList([checkpoint_callback])  # , eval_callback])
  # Train the model
  model.learn(4000000, log_interval=100, callback=callback)
  model.save("./her_robot_env")
Example #5
def test_model_manipulation(model_class, goal_selection_strategy):
    env = BitFlippingEnv(N_BITS,
                         continuous=model_class in [DDPG, SAC],
                         max_steps=N_BITS)
    env = DummyVecEnv([lambda: env])

    model = HER('MlpPolicy',
                env,
                model_class,
                n_sampled_goal=3,
                goal_selection_strategy=goal_selection_strategy,
                verbose=0)
    model.learn(1000)

    model_predict(model, env, n_steps=100, additional_check=None)

    model.save('./test_her')
    del model

    # NOTE: HER does not support VecEnvWrapper yet
    with pytest.raises(AssertionError):
        model = HER.load('./test_her', env=VecNormalize(env))

    model = HER.load('./test_her')

    # Check that the model raises an error when the env
    # is not wrapped (or no env passed to the model)
    with pytest.raises(ValueError):
        model.predict(env.reset())

    env_ = BitFlippingEnv(N_BITS,
                          continuous=model_class in [DDPG, SAC],
                          max_steps=N_BITS)
    env_ = HERGoalEnvWrapper(env_)

    model_predict(model, env_, n_steps=100, additional_check=None)

    model.set_env(env)
    model.learn(1000)

    model_predict(model, env_, n_steps=100, additional_check=None)

    assert model.n_sampled_goal == 3

    del model

    env = BitFlippingEnv(N_BITS,
                         continuous=model_class in [DDPG, SAC],
                         max_steps=N_BITS)
    model = HER.load('./test_her', env=env)
    model.learn(1000)

    model_predict(model, env_, n_steps=100, additional_check=None)

    assert model.n_sampled_goal == 3

    if os.path.isfile('./test_her.pkl'):
        os.remove('./test_her.pkl')
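A hedged note on running the test above: in a pytest suite it would normally be parametrized over model classes and goal-selection strategies, but it can also be invoked directly, assuming N_BITS and model_predict are defined in the same module as in the original test file.

if __name__ == '__main__':
    # Run a single combination directly; DQN would require the discrete variant of the env.
    test_model_manipulation(SAC, 'future')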
def main(load_policy=False):
    global log_dir
    model_class = TD3  # works also with SAC and DDPG
    action_space = 7
    normalize_observations = False
    gamma = 0.9
    memory_limit = 1000000
    normalize_returns = True
    timesteps = 8000000
    rend = False

    obj_pose_rnd_std = 0

    env = pandaPushGymGoalEnv(renders=rend,
                              use_IK=0,
                              numControlledJoints=action_space,
                              obj_pose_rnd_std=obj_pose_rnd_std,
                              includeVelObs=True)

    env = Monitor(env, log_dir, allow_early_resets=True)

    goal_selection_strategy = 'future'
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) *
                                                np.ones(n_actions))
    # Wrap the model

    model = HER(
        CustomTD3Policy,
        env,
        model_class,
        n_sampled_goal=4,
        goal_selection_strategy=goal_selection_strategy,
        verbose=1,
        tensorboard_log=
        "../pybullet_logs/panda_push_TD3/stable_baselines/TD3_phase1_target_fixed",
        buffer_size=1000000,
        batch_size=256,
        random_exploration=0.3,
        action_noise=action_noise)

    if (load_policy):
        model = HER.load(
            "../policies/USEFUL_POLICIES/PUSHING_TD3+HER_FIXED_POSITIONbest_model.pkl",
            env=env,
            n_sampled_goal=4,
            goal_selection_strategy=goal_selection_strategy,
            tensorboard_log=
            "../pybullet_logs/panda_push_TD3/stable_baselines/TD3_phase1_target_fixed",
            buffer_size=1000000,
            batch_size=256,
            random_exploration=0.3,
            action_noise=action_noise)

    model.learn(timesteps, log_interval=100, callback=callback)
    print("Saving Policy PHASE_1")
    model.save("../policies/TD3_phase1_target_fixed")
Example #7
def main():
    model_class = DDPG  # works also with SAC and DDPG

    # -j
    action_space = 7
    # -p
    fixed = True
    # -o
    normalize_observations = False
    # -g
    gamma = 0.9
    # -b
    #batch_size = 16
    # -m
    memory_limit = 1000000
    # -r
    normalize_returns = True
    # -t
    timesteps = 1000000
    policy_name = "pushing_policy"
    discreteAction = 0
    rend = False
    env = pandaPushGymEnvHERRand(urdfRoot=robot_data.getDataPath(),
                                 renders=rend,
                                 useIK=0,
                                 isDiscrete=discreteAction,
                                 action_space=action_space,
                                 fixedPositionObj=fixed,
                                 includeVelObs=True)

    # Available strategies (cf paper): future, final, episode, random
    goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=float(0.5) *
                                                np.ones(n_actions))
    # Wrap the model

    model = HER(
        CustomPolicy,
        env,
        model_class,
        n_sampled_goal=4,
        goal_selection_strategy=goal_selection_strategy,
        verbose=1,
        tensorboard_log=
        "../pybullet_logs/panda_push_ddpg/stable_baselines/DDPG+HER_FIXED_DYN_RAND",
        buffer_size=1000000,
        batch_size=256,
        random_exploration=0.3,
        action_noise=action_noise)

    # Train the model starting from a previous policy
    model.learn(timesteps)
    print("Saving Policy")
    model.save("../policies/pushing_fixed_HER_Dyn_Rand")
def launchAgent(model_name: str):
    """
    :param model_name: 실행시킬 모델의 종류. HER, DDPG, PPO2 혹은 기타값(DQN)이어야 함
                        현재는 의도상 PPO2로 세팅할 것
    :return: 1000회의 사이클을 돌고 난 이후의 모델
    """
    import Reinforcement_AI.env.e_enhanced_image_env as image_env
    from stable_baselines import DQN, HER, DDPG, PPO2
    from stable_baselines.common import make_vec_env

    print("Current Env is " + model_name)

    if model_name == "HER":
        env = image_env.DetailedMiniMapEnv()
        model = HER("CnnPolicy", env=env, model_class=DQN)
    if model_name == "DDPG":
        env = image_env.DDPGImageEnv()
        model = DDPG(policy="CnnPolicy", env=env, normalize_observations=True)
    if model_name == "PPO2":
        env = make_vec_env(image_env.DetailedMiniMapEnv, n_envs=1)
        model = PPO2(policy="CnnPolicy", env=env, verbose=1)
    else:
        env = image_env.DetailedMiniMapEnv()
        model = DQN(
            "CnnPolicy",  # policy
            env=env,  # environment
            double_q=True,  # Double Q enable
            prioritized_replay=True,  # Replay buffer enabled
            verbose=0  # log print
        )

    for i in range(1000):
        if i != 0:
            if model_name == "HER":
                model = HER.load("detailedmap_HER_" + str(i), env)
            if model_name == "DDPG":
                model = DDPG.load("detailedmap_DDPG_" + str(i), env)
            if model_name == "PPO2":
                model = PPO2.load("detailedmap_PPO2_" + str(i), env)
            else:
                model = DQN.load("detailedmap_DQN_" + str(i), env)

        # print('model learn start')
        model.learn(total_timesteps=12500)  # minimum value at which the FPS stays above 130
        print("this model is : detailedmap_" + model_name + "_" + str(i + 1))
        # print('model learn finished')

        # print('model save start')
        model.save("detailedmap_" + model_name + "_" + str(i + 1))
        del model
        # print('model save end')

    return model
def heralgorithm():

    goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

    # Wrap the model
    model = HER('MlpPolicy',
                env1,
                DDPG,
                n_sampled_goal=4,
                goal_selection_strategy=goal_selection_strategy,
                verbose=1)
    # Train the model
    model.learn(1000)

    model.save("./her_bit_env")
Example #10
def train_HER(env_train, model_name, timesteps=50000):
    start = time.time()
    n_sampled_goal = 4
    goal_selection_strategy = 'future'
    model = HER('MlpPolicy',
                env_train,
                model_class=SAC,
                verbose=0,
                n_sampled_goal=n_sampled_goal,
                goal_selection_strategy=goal_selection_strategy)
    model.learn(total_timesteps=timesteps)
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (HER): ', (end - start) / 60, ' minutes')
    return model
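A hedged usage sketch for train_HER above; the environment choice is a placeholder (any dict-observation GoalEnv works), and config.TRAINED_MODEL_DIR is assumed to be defined by the surrounding module.

import gym

if __name__ == '__main__':
    # Hypothetical goal-based environment; requires the gym robotics extras.
    env_train = gym.make('FetchReach-v1')
    trained_model = train_HER(env_train, model_name='her_sac_demo', timesteps=50000)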
Example #11
def main(env):

    n_actions = env.action_space.shape[0]
    noise_std = 0.2
    # Currently using OU noise
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=noise_std *
                                                np.ones(n_actions))
    model_class = DDPG  # works also with SAC, DDPG and TD3

    rl_model_kwargs = {
        'actor_lr': 1e-3,
        'critic_lr': 1e-3,
        'action_noise': action_noise,
        'nb_train_steps': 300,
        'nb_rollout_steps': 100,
        'gamma': 0.95,
        'observation_range': (-1.5, 1.5),
        'random_exploration': 0.05,
        'normalize_observations': True,
        'critic_l2_reg': 0.01
    }

    # Available strategies (cf paper): future, final, episode, random
    model = HER('MlpPolicy',
                env,
                model_class,
                verbose=1,
                n_sampled_goal=4,
                goal_selection_strategy='future',
                buffer_size=int(1e5),
                batch_size=128,
                tensorboard_log="./ddpg_dvrk_tensorboard/",
                **rl_model_kwargs)
    # Reset the model
    env.reset()
    # Train the model
    model.learn(4000000,
                log_interval=100,
                callback=CheckpointCallback(
                    save_freq=100000, save_path="./ddpg_dvrk_tensorboard/"))
    model.save("./her_robot_env")
def main(load_policy=False):

    global log_dir, log_dir_policy
    if (load_policy):
        log_dir_policy = '../policies/PUSHING_TD3+HER_FIXED_POSITION_DYN_RAND_FROM_FIXED_PHYSICS'
    model_class = TD3  # works also with SAC and DDPG
    action_space = 7
    fixed = True
    normalize_observations = False
    gamma = 0.9
    memory_limit = 1000000
    normalize_returns = True
    timesteps = 1500000
    discreteAction = 0
    rend = False
    env = pandaPushGymEnvHERRand(urdfRoot=robot_data.getDataPath(),
                                 renders=rend,
                                 useIK=0,
                                 isDiscrete=discreteAction,
                                 action_space=action_space,
                                 fixedPositionObj=fixed,
                                 includeVelObs=True)


    env = Monitor(env, log_dir, allow_early_resets=True)
    # Available strategies (cf paper): future, final, episode, random
    goal_selection_strategy = 'future' # equivalent to GoalSelectionStrategy.FUTURE
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))
    # Wrap the model

    model = HER(CustomPolicy,
                env,
                model_class,
                n_sampled_goal=4,
                goal_selection_strategy=goal_selection_strategy,
                verbose=1,
                tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/TD3+HER_FIXED_DYN_RAND",
                buffer_size=1000000,
                batch_size=256,
                random_exploration=0.3,
                action_noise=action_noise)

    if (load_policy):
        model = HER.load("../policies/USEFUL_POLICIES/PUSHING_TD3+HER_FIXED_POSITIONbest_model.pkl",
                         env=env,
                         n_sampled_goal=4,
                         goal_selection_strategy=goal_selection_strategy,
                         tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/TD3+HER_FIXED_DYN_RAND_FROM_FIXED_PHYSICS",
                         buffer_size=1000000,
                         batch_size=256,
                         random_exploration=0.3,
                         action_noise=action_noise)

    # Train the model starting from a previous policy
    model.learn(timesteps, callback=callback)
    model.save("../policies/PUSHING_FIXED_TD3_DYN_RAND")
    print("Finished train1")
def main(load_policy=False):
    global log_dir
    model_class = TD3  # works also with SAC and DDPG
    action_space = 6
    fixed = True
    #0 completely fixed, 1 slightly random radius, 2 big random radius,
    object_position = 1
    normalize_observations = False
    gamma = 0.9
    memory_limit = 1000000
    normalize_returns = True
    timesteps = 5000000
    discreteAction = 0
    rend = False

    env = pandaPushGymEnvHER(urdfRoot=robot_data.getDataPath(),
                             renders=rend,
                             useIK=1,
                             isDiscrete=discreteAction,
                             action_space=action_space,
                             fixedPositionObj=fixed,
                             includeVelObs=True,
                             object_position=object_position)

    env = Monitor(env, log_dir, allow_early_resets=True)

    goal_selection_strategy = 'future'
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))
    # Wrap the model

    model = HER(CustomTD3Policy,
                env,
                model_class,
                n_sampled_goal=4,
                goal_selection_strategy=goal_selection_strategy,
                verbose=1,
                tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/PUSHING_TD3+HER_FIXED_POSITION_PHASE_1_IK",
                buffer_size=1000000,
                batch_size=256,
                random_exploration=0.3,
                action_noise=action_noise)

    if (load_policy):
        model = HER.load("../policies/USEFUL_POLICIES/PUSHING_TD3+HER_FIXED_POSITIONbest_model.pkl",
                         env=env,
                         n_sampled_goal=4,
                         goal_selection_strategy=goal_selection_strategy,
                         tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/PUSHING_TD3+HER_FIXED_POSITION_PHASE_1_IK",
                         buffer_size=1000000,
                         batch_size=256,
                         random_exploration=0.3,
                         action_noise=action_noise)

    model.learn(timesteps, log_interval=100, callback=callback)
    print("Saving Policy PHASE_1")
    model.save("../policies/PUSHING_TD3+HER_FIXED_POSITION_PHASE_1_IK")
Example #14
            policy_kwargs=dict(layers=[256, 256, 256]))

# DDPG Hyperparams:
# NOTE: it works even without action noise
# n_actions = env.action_space.shape[0]
# noise_std = 0.2
# action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=noise_std * np.ones(n_actions))
# model = HER('MlpPolicy', env, DDPG, n_sampled_goal=n_sampled_goal,
#             goal_selection_strategy='future',
#             verbose=1, buffer_size=int(1e6),
#             actor_lr=1e-3, critic_lr=1e-3, action_noise=action_noise,
#             gamma=0.95, batch_size=256,
#             policy_kwargs=dict(layers=[256, 256, 256]))

model.learn(int(2e5))
model.save('her_sac_highway')

# Load saved model
model = HER.load('her_sac_highway', env=env)

obs = env.reset()

# Evaluate the agent
episode_reward = 0
for _ in range(100):
    action, _ = model.predict(obs)
    obs, reward, done, info = env.step(action)
    env.render()
    episode_reward += reward
    if done or info.get('is_success', False):
        print("Reward:", episode_reward, "Success?",
              info.get('is_success', False))
        episode_reward = 0
        obs = env.reset()
Example #15
    'param_noise': None,
    'action_noise': action_noise,
    'normalize_observations': normalize,
    'nb_train_steps': nb_train_steps,
    'nb_rollout_steps': nb_rollout_steps,
    'batch_size': batch_size,
    'critic_l2_reg': critic_l2_reg,
    'buffer_size': buffer_size,
    'random_exploration': random_exploration,
    'policy_kwargs': {
        'layer_norm': True
    },
    'logging': suff
}
model = HER('MlpPolicy', env, DDPG, **kwargs)
start = time.time()

model.learn(total_timesteps=total_timesteps, log_interval=1)

if log:
    model.save("pkl/{}".format(suff))
    print(
        "Saved as {0}, trained {1} primitive policy for {2} timesteps in {3}".
        format(suff, policy, total_timesteps,
               time.strftime('%H:%M:%S', time.gmtime(time.time() - start))))

else:
    print("Trained {0} primitive policy for {1} timesteps in {2}".format(
        policy, total_timesteps,
        time.strftime('%H:%M:%S', time.gmtime(time.time() - start))))
Example #16
# Available strategies (cf paper): future, final, episode, random
goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

# Wrap the model
model = HER('MlpPolicy',
            env,
            model_class,
            n_sampled_goal=4,
            goal_selection_strategy=goal_selection_strategy,
            verbose=1)

# Train the model
model.learn(500000)

model.save("her_fetch_reach_env")

# def policy(observation, desired_goal):
#     # Here you would implement your smarter policy. In this case,
#     # we just sample random actions.
#     return env.action_space.sample()

# while not done:
#     env.render()
#     action = policy(obs['observation'], obs['desired_goal'])
#     obs, reward, done, info = env.step(action)

#     # If we want, we can substitute a goal here and re-compute
#     # the reward. For instance, we can just pretend that the desired
#     # goal was what we achieved all along.
#     substitute_goal = obs['achieved_goal'].copy()
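Example #16 uses env and model_class without defining them; the her_fetch_reach_env save name suggests a Fetch reaching task. A minimal sketch of the assumed setup (both the environment id and the algorithm are guesses):

import gym
from stable_baselines import HER, SAC

model_class = SAC  # placeholder; DDPG or TD3 would work the same way
env = gym.make('FetchReach-v1')  # hypothetical goal environment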
Example #17
# Available strategies (cf paper): future, final, episode, random
goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

# Wrap the model
model = HER('MlpPolicy',
            env,
            model_class,
            n_sampled_goal=4,
            goal_selection_strategy=goal_selection_strategy,
            verbose=1)

# Train the model
model.learn(100000)

model.save("fetch_reach_sac_her_future_100000")

# def policy(observation, desired_goal):
#     # Here you would implement your smarter policy. In this case,
#     # we just sample random actions.
#     return env.action_space.sample()

# while not done:
#     env.render()
#     action = policy(obs['observation'], obs['desired_goal'])
#     obs, reward, done, info = env.step(action)

#     # If we want, we can substitute a goal here and re-compute
#     # the reward. For instance, we can just pretend that the desired
#     # goal was what we achieved all along.
#     substitute_goal = obs['achieved_goal'].copy()
Example #18
def callback(_locals, _globals):
    global n_steps

    n_steps += 1
    if n_steps % 50000 == 0 or n_steps == 10000:
        print('Saving: ', n_steps)
        save_path = 'checkpoints/yumi/her/her_{}_task_{}_{}.npy'.format(
            name, args.task, n_steps)
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        model.save(save_path)

    return True


model = HER('MlpPolicy',
            env,
            model_class=DDPG,
            verbose=1,
            tensorboard_log=log_dir,
            **dict(random_exploration=.2))
model.learn(total_timesteps=total_timesteps, callback=callback)
model.save("her-yumi-{}-final".format(n_steps))

env.save_running_average(log_dir)

obs = env.reset()
for i in range(100):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
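The checkpointing callback in Example #18 mutates a global n_steps and reads name, args and log_dir, none of which appear in the snippet. A hedged sketch of the minimal definitions it assumes (all values are placeholders; args would come from the script's own argument parser):

# Globals assumed by callback() and the training code above.
n_steps = 0
name = 'yumi_ddpg_her'          # run name used in the checkpoint filename
total_timesteps = 2000000       # training budget passed to model.learn()
log_dir = './logs/yumi_her/'    # also used as the tensorboard_log directory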
Example #19
# Available strategies (cf paper): future, final, episode, random
goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

# Wrap the model
model = HER('MlpPolicy',
            env,
            model_class,
            n_sampled_goal=4,
            goal_selection_strategy=goal_selection_strategy,
            verbose=1)

# Train the model
model.learn(100000)

model.save("hand_egg_DDPG_her_random_100000")

# def policy(observation, desired_goal):
#     # Here you would implement your smarter policy. In this case,
#     # we just sample random actions.
#     return env.action_space.sample()

# while not done:
#     env.render()
#     action = policy(obs['observation'], obs['desired_goal'])
#     obs, reward, done, info = env.step(action)

#     # If we want, we can substitute a goal here and re-compute
#     # the reward. For instance, we can just pretend that the desired
#     # goal was what we achieved all along.
#     substitute_goal = obs['achieved_goal'].copy()
Example #20
env = gym.make('PointMass-%d-v1' % num_objs)
n_actions = env.action_space.shape[-1]
stddev = 0.2
action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                 sigma=0.01 * np.ones(n_actions))

policy = 'MlpPolicy'

args_alg = dict(
    random_exploration=0.2,
    buffer_size=int(1E6),
    batch_size=256,
    nb_eval_steps=10,
    action_noise=action_noise,
    tensorboard_log=logger,
)

model = HER(policy,
            env,
            model_class,
            n_sampled_goal=4,
            goal_selection_strategy=goal_selection_strategy,
            verbose=1,
            **args_alg)
model.learn(int(nIter))
model.save(expDir + "/%s" % np.format_float_scientific(nIter))
#model = HER.load("point1_deter", env=env)

record_her_indep(env, model, expDir, num_files=10, video_len=500)
Example #21
        lambda: env
    ])  # The algorithms require a vectorized environment to run
    num_env = 2

    #env = SubprocVecEnv([make_env(env_id, log_dir, i+worker_id) for i in range(num_env)])
    model_class = DQN
    # Available strategies (cf paper): future, final, episode, random
    goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE
    model = HER('MlpPolicy',
                env,
                model_class,
                n_sampled_goal=4,
                goal_selection_strategy=goal_selection_strategy,
                verbose=1)
    model.learn(total_timesteps=1000)
    model.save(log_dir + "model")

    # WARNING: you must pass an env
    # or wrap your environment with HERGoalEnvWrapper to use the predict method
    model = HER.load(log_dir + "model", env=env)

    #evaluate agent
    episodes = 100
    ep_r = []
    ep_l = []
    for e in range(episodes):
        obs = env.reset()
        total_r = 0.
        total_l = 0.
        while True:
            action, _states = model.predict(obs)
Example #22
                goal_selection_strategy=args.goal_selection_strategy,
                verbose=1,
                exploration_fraction=args.exploration_fraction,
                tensorboard_log=args.tensorboard_log_path + '/' + args.name)
else:
    model = DQN(MlpPolicy,
                env,
                verbose=1,
                tensorboard_log='/srv/share/nkannabiran3/DQN/',
                double_q=True,
                prioritized_replay=True,
                prioritized_replay_alpha=0.8,
                prioritized_replay_beta0=0.2)
print('learning')
os.mkdir(args.tensorboard_log_path + '/' + args.name)
parser.save_args()
model.learn(total_timesteps=args.num_training_steps,
            tb_log_name=args.tensorboard_log_path + '/' + args.name)
model.save(args.name)

# del model # remove to demonstrate saving and loading

# model = DQN.load("deepq_cartpole")

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    if dones:
        break
    # env.render()
    def get_goal(self):
        """Get a random goal depending on the difficulty."""
        self.difficulty = np.random.choice(self.difficulties)
        return move_cube.sample_goal(difficulty=self.difficulty)


if __name__ == "__main__":

    difficulties = np.arange(1, 5)
    env = gym.make(
        'Example_CubeEnv-v0',
        initializer=Initializer(difficulty=difficulties),
        action_type=cube_env.ActionType.POSITION,
        visualization=False,
    )

    model_kwargs = {
        'ent_coef': 'auto',
        'buffer_size': int(1e6),
        'gamma': 0.95,
        'learning_starts': 1000,
        'train_freq': 1
    }

    model = HER('MlpPolicy', env, SAC, verbose=True, **model_kwargs)

    model.learn(int(8e6))
    model.save("./hersac_CubeEnv_diffall")

    os.system("shutdown now")
Example #24
import gym

from stable_baselines.common.policies import MlpPolicy
from stable_baselines import DDPG, HER, TD3, SAC

env = gym.make('FetchReach-v1')

#model = DDPG('MlpPolicy', env)
model = HER('MlpPolicy',
            env,
            DDPG,
            goal_selection_strategy='final',
            n_sampled_goal=4)
model.learn(50000000)
model.save('./her_fetch_reach')

#model = HER.load('./her_fetch_reach', env=env)

for _ in range(100):
    obs = env.reset()
    state = None
    done = False
    _reward = 0

    while not done:
        env.render()
        action, state = model.predict(obs)
        obs, reward, done, info = env.step(action)
        _reward += reward

    print("Reward = {}".format(_reward))
Example #25
env, _ = load_env(env_name,
                  core_dir=core_dir,
                  envs_dir=envs_dir,
                  xmls_dir=xmls_dir,
                  return_args_remaining=True)
# Available strategies (cf paper): future, final, episode, random
goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

# Wrap the model
model = HER('MlpPolicy',
            env,
            model_class,
            n_sampled_goal=4,
            goal_selection_strategy=goal_selection_strategy,
            verbose=1)
# Train the model
model.learn(1000)

model.save("./her_bit_env")

# WARNING: you must pass an env
# or wrap your environment with HERGoalEnvWrapper to use the predict method
model = HER.load('./her_bit_env', env=env)

obs = env.reset()
for _ in range(100):
    action, _ = model.predict(obs)
    obs, reward, done, _ = env.step(action)

    if done:
        obs = env.reset()
# Available strategies (cf paper): future, final, episode, random
goal_selection_strategy = 'random'  # equivalent to GoalSelectionStrategy.RANDOM

# Wrap the model
model = HER('MlpPolicy',
            env,
            model_class,
            n_sampled_goal=4,
            goal_selection_strategy=goal_selection_strategy,
            verbose=1)

# Train the model
model.learn(10000)

model.save("hand_reach_sac_her_random_10000")

# def policy(observation, desired_goal):
#     # Here you would implement your smarter policy. In this case,
#     # we just sample random actions.
#     return env.action_space.sample()

# while not done:
#     env.render()
#     action = policy(obs['observation'], obs['desired_goal'])
#     obs, reward, done, info = env.step(action)

#     # If we want, we can substitute a goal here and re-compute
#     # the reward. For instance, we can just pretend that the desired
#     # goal was what we achieved all along.
#     substitute_goal = obs['achieved_goal'].copy()
            goal_selection_strategy='future',
            verbose=1,
            buffer_size=int(1e6),
            learning_rate=0.001,
            gamma=0.95,
            batch_size=256,
            ent_coef='auto',
            random_exploration=0.3,
            learning_starts=1000,
            train_freq=1,
            policy_kwargs=dict(layers=[256, 256, 256]),
            tensorboard_log="./OpenAI/")
# Train the model
model.learn(int(8e6))

model.save("./model2")

# WARNING: you must pass an env
# or wrap your environment with HERGoalEnvWrapper to use the predict method
model = HER.load('./model2', env=env)

obs = env.reset()
episodes = 0
successes = 0
step = 0
while (episodes < 50):
    step += 1
    action, _ = model.predict(obs)
    obs, reward, done, _ = env.step(action)
    env.render()
    if done or step > 1000:
Example #28
goal_selection_strategy = 'future' 


n_actions = env.action_space.shape[-1]
stddev = 0.2
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.01 * np.ones(n_actions))
policy_kwargs = {}

args_alg = dict(
    random_exploration=0.3,
    buffer_size=int(1E6),
    batch_size=256,
    nb_eval_steps=10,
    # actor_lr=1e-3,
    action_noise=action_noise,
    policy_kwargs=policy_kwargs
)

model = HER('MlpPolicy',
            env,
            model_class,
            n_sampled_goal=4,
            goal_selection_strategy=goal_selection_strategy,
            verbose=1,
            **args_alg)
# Train the model
model.learn(50000)

model.save("fetch_trial")

# WARNING: you must pass an env
# or wrap your environment with HERGoalEnvWrapper to use the predict method

#model = HER.load('fetch_trial.zip', env=env)
dirs = os.getcwd()
record_her_indep(env, model, dirs)
def launchAgent():
    import Reinforcement_AI.env.d_image_env as image_env
    from stable_baselines import DQN, HER, DDPG, PPO2
    from stable_baselines.common import make_vec_env

    model_name = "PPO2"

    if model_name == "HER":
        model = HER(
            "CnnPolicy",
            env=image_env.DetailedMiniMapEnv(),
            model_class=DQN
        )
    if model_name == "DDPG":
        model = DDPG(
            policy="CnnPolicy",
            env=image_env.DDPGImageEnv(),
            normalize_observations=True
        )
    if model_name == "PPO2":
        # env = image_env.DetailedMiniMapEnv()
        env = make_vec_env(image_env.DetailedMiniMapEnv, n_envs=1)
        model = PPO2(
            policy="CnnPolicy",
            env=env,
            verbose=1
        )
    else:
        model = DQN(
            "CnnPolicy",  # policy
            env=image_env.DetailedMiniMapEnv(),  # environment
            double_q=True,  # Double Q enable
            prioritized_replay=True,  # Replay buffer enabled
            verbose=0  # log print
        )

    for i in range(1000):
        if i != 0:
            if model_name == "HER":
                model = HER.load("detailedmap_HER_" + str(i))
                model.set_env(image_env.DetailedMiniMapEnv())
            if model_name == "DDPG":
                model = DDPG.load("detailedmap_DDPG_" + str(i))
                model.set_env(image_env.DDPGImageEnv())
            if model_name == "PPO2":
                # print('set env')
                # ppo2_env = make_vec_env(image_env.DetailedMiniMapEnv, n_envs=1)
                # print('get model')
                model = PPO2.load("detailedmap_PPO2_" + str(i), env)
                # print('set model env')
                # model.set_env(ppo2_env)
            else:
                model = DQN.load("detailedmap_DQN_" + str(i))
                model.set_env(image_env.DetailedMiniMapEnv())

        # print('model learn start')
        model.learn(total_timesteps=3900)
        # print('model learn finished')

        # print('model save start')
        model.save("detailedmap_" + model_name + "_" + str(i+1))
        del model
Example #30
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2, DQN, HER, DDPG

import synergyenvs

env = gym.make("GraspBoxPybullet-v0")
env.render()
o = env.reset()

# model = PPO2(MlpPolicy, env, verbose=1)
model = HER('MlpPolicy', env, DDPG, n_sampled_goal=4, verbose=1)
model.learn(50000)

model.save("./her_graspbox-1")

env.camera_adjust()

for _ in range(1000):
    o = env.reset()
    env.render()
    action, _states = model.predict(o)
    # action = env.action_space.sample()
    o, r, done, info = env.step(action)
    print(o, r, done, info)
    if done:
        o = env.reset()
    time.sleep(0.2)

env.close()