Exemple #1
0
def main(load_policy=True):
    """Train (or resume) a TD3+HER policy on bioEnv and save the result."""
    global log_dir

    model_class = TD3  # works also with SAC and DDPG

    # Run configuration (several knobs are retained for reference only).
    action_space = 6
    gamma = 0.9
    memory_limit = 1000000
    timesteps = 15000000
    discreteAction = 0
    rend = False

    # Build the environment and wrap it with a monitor for episode logging.
    env = bioEnv()
    env = Monitor(env, log_dir, allow_early_resets=True)

    goal_selection_strategy = 'future'
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(n_actions),
        sigma=float(0.5) * np.ones(n_actions))

    # Keyword arguments shared by fresh construction and checkpoint loading.
    her_kwargs = dict(
        n_sampled_goal=4,
        goal_selection_strategy=goal_selection_strategy,
        tensorboard_log="../pybullet_logs/bioEnv_TD3",
        buffer_size=1000000,
        batch_size=256,
        random_exploration=0.3,
        action_noise=action_noise)

    model = HER(CustomTD3Policy, env, model_class, verbose=1, **her_kwargs)

    if load_policy:
        # Resume from a previously trained curriculum checkpoint instead.
        model = HER.load(
            "models/TD3/curriculum/best_model_part_11_10g_TRUE.pkl",
            env=env, **her_kwargs)

    model.learn(timesteps, log_interval=100, callback=callback)

    model.save("policy_TD3_Discr")
Exemple #2
0
def train_policy(num_of_envs, log_relative_path, maximum_episode_length,
                 skip_frame, seed_num, her_config, total_time_steps,
                 validate_every_timesteps, task_name):
    """Train a SAC+HER agent on a CausalWorld task under a goal-intervention
    curriculum, checkpointing periodically into log_relative_path."""
    # Dense reward weighted entirely on the first component.
    task = generate_task(task_generator_id=task_name,
                         dense_reward_weights=np.array([100000, 0, 0, 0]),
                         fractional_reward_weight=0)
    world = CausalWorld(task=task,
                        skip_frame=skip_frame,
                        enable_visualization=False,
                        seed=seed_num,
                        max_episode_length=maximum_episode_length)
    # Expose the goal interface required by HER, then layer the curriculum on top.
    env = CurriculumWrapper(
        HERGoalEnvWrapper(world),
        intervention_actors=[GoalInterventionActorPolicy()],
        actives=[(0, 1000000000, 1, 0)])
    set_global_seeds(seed_num)

    # Checkpoint frequency is expressed per (conceptual) environment.
    checkpoint_callback = CheckpointCallback(
        save_freq=int(validate_every_timesteps / num_of_envs),
        save_path=log_relative_path,
        name_prefix='model')

    model = HER(MlpPolicy,
                env,
                SAC,
                verbose=1,
                seed=seed_num,
                policy_kwargs=dict(layers=[256, 256, 256]),
                **her_config)
    model.learn(total_timesteps=total_time_steps,
                tb_log_name="her_sac",
                callback=checkpoint_callback)
    return
Exemple #3
0
def train_policy(num_of_envs, log_relative_path, maximum_episode_length,
                 skip_frame, seed_num, sac_config, total_time_steps,
                 validate_every_timesteps, task_name):
    """Train SAC+HER on a CausalWorld task across several subprocess envs,
    alternating learn() calls with model checkpoints."""

    def _make_env(rank):
        # Factory returning a thunk, as SubprocVecEnv requires.
        def _init():
            task = generate_task(task_generator_id=task_name)
            world = CausalWorld(task=task,
                                skip_frame=skip_frame,
                                enable_visualization=False,
                                seed=seed_num + rank,
                                max_episode_length=maximum_episode_length)
            return HERGoalEnvWrapper(world)

        set_global_seeds(seed_num)
        return _init

    os.makedirs(log_relative_path)  # intentionally raises if the dir already exists
    env = SubprocVecEnv([_make_env(rank=i) for i in range(num_of_envs)])

    model = HER('MlpPolicy',
                env,
                SAC,
                verbose=1,
                policy_kwargs=dict(layers=[256, 256, 256]),
                **sac_config)
    save_config_file(sac_config,
                     _make_env(0)(),
                     os.path.join(log_relative_path, 'config.json'))

    # Interleave training with checkpoints without resetting the step counter.
    for _ in range(int(total_time_steps / validate_every_timesteps)):
        model.learn(total_timesteps=validate_every_timesteps,
                    tb_log_name="sac",
                    reset_num_timesteps=False)
    model.save(os.path.join(log_relative_path, 'saved_model'))
    return
Exemple #4
0
def main(argv):
    """Train a DDPG+HER reaching policy for the KUKA arm and save it."""
    # Run configuration (several values are informational only in this script).
    numControlledJoints = 6
    fixed = False
    normalize_observations = False
    gamma = 0.9
    batch_size = 16
    memory_limit = 1000000
    normalize_returns = True
    timesteps = 1000000
    policy_name = "reaching_policy"
    discreteAction = 0
    rend = False

    # Build the reaching env and wrap it with a monitor for episode logging.
    kukaenv = kukaReachGymEnvHer(urdfRoot=robot_data.getDataPath(),
                                 renders=rend,
                                 useIK=0,
                                 isDiscrete=discreteAction,
                                 numControlledJoints=numControlledJoints,
                                 fixedPositionObj=fixed,
                                 includeVelObs=True)
    kukaenv = Monitor(kukaenv, log_dir, allow_early_resets=True)

    # Ornstein-Uhlenbeck exploration noise over every controlled joint.
    n_actions = kukaenv.action_space.shape[-1]
    param_noise = None
    action_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(n_actions),
        sigma=float(0.5) * np.ones(n_actions))

    model_class = DDPG
    goal_selection_strategy = 'future'
    model = HER(CustomPolicy,
                kukaenv,
                model_class,
                n_sampled_goal=4,
                goal_selection_strategy=goal_selection_strategy,
                verbose=1,
                tensorboard_log="../pybullet_logs/kuka_reach_ddpg/reaching_DDPG_HER_PHASE",
                buffer_size=1000000,
                batch_size=64,
                random_exploration=0.3,
                action_noise=action_noise)

    # Echo the run configuration in red before training starts.
    for label, value in (("-----Timesteps:", timesteps),
                         ("-----Number Joints Controlled:", numControlledJoints),
                         ("-----Object Position Fixed:", fixed),
                         ("-----Policy Name:", policy_name)):
        print(colored(label, "red"))
        print(colored(value, "red"))
    print(colored("------", "red"))
    print(colored("Launch the script with -h for further info", "red"))

    model.learn(total_timesteps=timesteps, log_interval=100, callback=callback)

    print("Saving model to kuka.pkl")
    model.save("../pybullet_logs/kukareach_ddpg_her/" + policy_name)

    del model  # remove to demonstrate saving and loading
def test_long_episode(model_class):
    """
    Check that the model does not break when the replay buffer is still empty
    after the first rollout (because the episode is not over).
    """
    n_bits = 10  # chosen so that n_bits > nb_rollout_steps below
    env = BitFlippingEnv(n_bits,
                         continuous=model_class in [DDPG, SAC, TD3],
                         max_steps=n_bits)

    # Force a rollout/batch shorter than one full episode.
    if model_class == DDPG:
        extra = {'nb_rollout_steps': 9}  # < n_bits
    elif model_class in [DQN, SAC, TD3]:
        extra = {'batch_size': 8,  # < n_bits
                 'learning_starts': 0}
    else:
        extra = {}

    model = HER('MlpPolicy', env, model_class,
                n_sampled_goal=4,
                goal_selection_strategy='future',
                verbose=0,
                **extra)
    model.learn(100)
Exemple #6
0
def main(
  training_env: PSMCartesianHERDDPGEnv,
  eval_env: PSMCartesianHERDDPGEnv = None,
  log_dir='./.logs/results'
):
    """Train DDPG+HER on the PSM cartesian env with periodic checkpoints."""
    os.makedirs(log_dir, exist_ok=True)

    # Ornstein-Uhlenbeck exploration noise over the whole action vector.
    n_actions = training_env.action_space.shape[0]
    noise_std = 0.2
    action_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(n_actions),
        sigma=noise_std * np.ones(n_actions))

    model_class = DDPG  # works also with SAC, DDPG and TD3

    # Keyword arguments forwarded to the underlying DDPG model.
    rl_model_kwargs = dict(
        actor_lr=1e-3,
        critic_lr=1e-3,
        action_noise=action_noise,
        nb_train_steps=300,
        nb_rollout_steps=100,
        gamma=0.95,
        observation_range=(-1.5, 1.5),
        random_exploration=0.05,
        normalize_observations=True,
        critic_l2_reg=0.01)

    # Available strategies (cf paper): future, final, episode, random
    model = HER(
        'MlpPolicy',
        training_env,
        model_class,
        verbose=1,
        n_sampled_goal=4,
        goal_selection_strategy='future',
        buffer_size=int(1e5),
        batch_size=128,
        tensorboard_log="./ddpg_dvrk_tensorboard/",
        **rl_model_kwargs)

    training_env.reset()

    # Only checkpointing is active; evaluation callback is left disabled.
    checkpoint_callback = CheckpointCallback(
        save_freq=100000,
        save_path="./ddpg_dvrk_tensorboard/")
    callback = CallbackList([checkpoint_callback])

    model.learn(4000000, log_interval=100, callback=callback)
    model.save("./her_robot_env")
Exemple #7
0
def test_model_manipulation(model_class, goal_selection_strategy):
    """Exercise HER save/load/set_env round-trips on the bit-flipping task."""

    def make_env():
        return BitFlippingEnv(N_BITS,
                              continuous=model_class in [DDPG, SAC],
                              max_steps=N_BITS)

    env = DummyVecEnv([make_env])

    model = HER('MlpPolicy', env, model_class,
                n_sampled_goal=3,
                goal_selection_strategy=goal_selection_strategy,
                verbose=0)
    model.learn(1000)
    model_predict(model, env, n_steps=100, additional_check=None)

    model.save('./test_her')
    del model

    # NOTE: HER does not support VecEnvWrapper yet
    with pytest.raises(AssertionError):
        model = HER.load('./test_her', env=VecNormalize(env))

    model = HER.load('./test_her')

    # Without a (wrapped) env, predict() must refuse to run.
    with pytest.raises(ValueError):
        model.predict(env.reset())

    her_env = HERGoalEnvWrapper(make_env())
    model_predict(model, her_env, n_steps=100, additional_check=None)

    # Re-attach an env and keep training.
    model.set_env(env)
    model.learn(1000)
    model_predict(model, her_env, n_steps=100, additional_check=None)
    assert model.n_sampled_goal == 3
    del model

    # Loading with a raw (unwrapped) goal env must also work.
    model = HER.load('./test_her', env=make_env())
    model.learn(1000)
    model_predict(model, her_env, n_steps=100, additional_check=None)
    assert model.n_sampled_goal == 3

    if os.path.isfile('./test_her.pkl'):
        os.remove('./test_her.pkl')
def main(load_policy=False):
    """Train TD3+HER on the panda pushing goal env (phase 1, fixed target)."""
    global log_dir
    model_class = TD3  # works also with SAC and DDPG

    # Run configuration (several knobs are retained for reference only).
    action_space = 7
    normalize_observations = False
    gamma = 0.9
    memory_limit = 1000000
    normalize_returns = True
    timesteps = 8000000
    rend = False
    obj_pose_rnd_std = 0  # object pose is deterministic

    env = pandaPushGymGoalEnv(renders=rend,
                              use_IK=0,
                              numControlledJoints=action_space,
                              obj_pose_rnd_std=obj_pose_rnd_std,
                              includeVelObs=True)
    env = Monitor(env, log_dir, allow_early_resets=True)

    goal_selection_strategy = 'future'
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(n_actions),
        sigma=float(0.5) * np.ones(n_actions))

    # Keyword arguments shared by fresh construction and checkpoint loading.
    her_kwargs = dict(
        n_sampled_goal=4,
        goal_selection_strategy=goal_selection_strategy,
        tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/TD3_phase1_target_fixed",
        buffer_size=1000000,
        batch_size=256,
        random_exploration=0.3,
        action_noise=action_noise)

    model = HER(CustomTD3Policy, env, model_class, verbose=1, **her_kwargs)

    if load_policy:
        # Resume from the best fixed-position pushing checkpoint instead.
        model = HER.load(
            "../policies/USEFUL_POLICIES/PUSHING_TD3+HER_FIXED_POSITIONbest_model.pkl",
            env=env, **her_kwargs)

    model.learn(timesteps, log_interval=100, callback=callback)
    print("Saving Policy PHASE_1")
    model.save("../policies/TD3_phase1_target_fixed")
Exemple #9
0
def test_her(model_class, goal_selection_strategy, discrete_obs_space):
    """Smoke-test HER training on BitFlippingEnv for each model/strategy combo."""
    continuous = model_class in [DDPG, SAC, TD3]
    env = BitFlippingEnv(N_BITS, continuous=continuous,
                         max_steps=N_BITS, discrete_obs_space=discrete_obs_space)

    extra = {}
    if continuous:
        extra['random_exploration'] = 0.1  # take random actions 10% of the time

    model = HER('MlpPolicy', env, model_class,
                n_sampled_goal=4,
                goal_selection_strategy=goal_selection_strategy,
                verbose=0, **extra)
    model.learn(1000)
Exemple #10
0
def main():
    """Train DDPG+HER pushing with dynamics randomization, then save the policy."""
    model_class = DDPG  # works also with SAC and DDPG

    # Run configuration; the short comments mirror the original CLI switches.
    action_space = 7                 # -j
    fixed = True                     # -p
    normalize_observations = False   # -o
    gamma = 0.9                      # -g
    #batch_size = 16                 # -b
    memory_limit = 1000000           # -m
    normalize_returns = True         # -r
    timesteps = 1000000              # -t
    policy_name = "pushing_policy"
    discreteAction = 0
    rend = False

    env = pandaPushGymEnvHERRand(urdfRoot=robot_data.getDataPath(),
                                 renders=rend,
                                 useIK=0,
                                 isDiscrete=discreteAction,
                                 action_space=action_space,
                                 fixedPositionObj=fixed,
                                 includeVelObs=True)

    # Available strategies (cf paper): future, final, episode, random
    goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(n_actions),
        sigma=float(0.5) * np.ones(n_actions))

    model = HER(CustomPolicy,
                env,
                model_class,
                n_sampled_goal=4,
                goal_selection_strategy=goal_selection_strategy,
                verbose=1,
                tensorboard_log="../pybullet_logs/panda_push_ddpg/stable_baselines/DDPG+HER_FIXED_DYN_RAND",
                buffer_size=1000000,
                batch_size=256,
                random_exploration=0.3,
                action_noise=action_noise)

    # Train the model starting from a previous policy
    model.learn(timesteps)
    print("Saving Policy")
    model.save("../policies/pushing_fixed_HER_Dyn_Rand")
def launchAgent(model_name: str):
    """Build the requested agent and run 1000 learn/save cycles on the minimap env.

    :param model_name: which model to run: "HER", "DDPG", "PPO2", or any other
                       value for the DQN fallback.  (The original intent was to
                       run with "PPO2".)
    :return: the model after the final cycle
    """
    import Reinforcement_AI.env.e_enhanced_image_env as image_env
    from stable_baselines import DQN, HER, DDPG, PPO2
    from stable_baselines.common import make_vec_env

    print("Current Env is " + model_name)

    # BUG FIX: these branches were independent `if`s with a trailing `else`,
    # so "HER" and "DDPG" models were immediately overwritten by the DQN
    # fallback.  `elif` makes the selection mutually exclusive.
    if model_name == "HER":
        env = image_env.DetailedMiniMapEnv()
        model = HER("CnnPolicy", env=env, model_class=DQN)
    elif model_name == "DDPG":
        env = image_env.DDPGImageEnv()
        model = DDPG(policy="CnnPolicy", env=env, normalize_observations=True)
    elif model_name == "PPO2":
        env = make_vec_env(image_env.DetailedMiniMapEnv, n_envs=1)
        model = PPO2(policy="CnnPolicy", env=env, verbose=1)
    else:
        env = image_env.DetailedMiniMapEnv()
        model = DQN(
            "CnnPolicy",  # policy
            env=env,  # environment
            double_q=True,  # Double Q enable
            prioritized_replay=True,  # Replay buffer enabled
            verbose=0  # log print
        )

    for i in range(1000):
        if i != 0:
            # Same `elif` fix as above: reload only the matching model type.
            if model_name == "HER":
                model = HER.load("detailedmap_HER_" + str(i), env)
            elif model_name == "DDPG":
                model = DDPG.load("detailedmap_DDPG_" + str(i), env)
            elif model_name == "PPO2":
                model = PPO2.load("detailedmap_PPO2_" + str(i), env)
            else:
                model = DQN.load("detailedmap_DQN_" + str(i), env)

        model.learn(total_timesteps=12500)  # minimum budget once FPS exceeds 130
        print("this model is : detailedmap_" + model_name + "_" + str(i + 1))
        model.save("detailedmap_" + model_name + "_" + str(i + 1))
        # BUG FIX: the original deleted `model` on every iteration and then
        # executed `return model`, which raised NameError.  Keep the last one.
        if i != 999:
            del model

    return model
def heralgorithm():
    """Train DDPG+HER on the module-level `env1` and save the result."""
    # equivalent to GoalSelectionStrategy.FUTURE
    strategy = 'future'

    model = HER('MlpPolicy',
                env1,
                DDPG,
                n_sampled_goal=4,
                goal_selection_strategy=strategy,
                verbose=1)
    # Train, then persist the policy.
    model.learn(1000)
    model.save("./her_bit_env")
Exemple #13
0
def train_HER(env_train, model_name, timesteps=50000):
    """Train SAC+HER on env_train, save under model_name, and report wall time."""
    start = time.time()

    model = HER('MlpPolicy',
                env_train,
                model_class=SAC,
                verbose=0,
                n_sampled_goal=4,
                goal_selection_strategy='future')
    model.learn(total_timesteps=timesteps)
    elapsed_minutes = (time.time() - start) / 60

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (HER): ', elapsed_minutes, ' minutes')
    return model
Exemple #14
0
def main(env):
    """Train DDPG+HER on the given robot env with periodic checkpoints."""
    # Ornstein-Uhlenbeck exploration noise over the whole action vector.
    n_actions = env.action_space.shape[0]
    noise_std = 0.2
    action_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(n_actions),
        sigma=noise_std * np.ones(n_actions))

    model_class = DDPG  # works also with SAC, DDPG and TD3

    # Keyword arguments forwarded to the underlying DDPG model.
    rl_model_kwargs = dict(
        actor_lr=1e-3,
        critic_lr=1e-3,
        action_noise=action_noise,
        nb_train_steps=300,
        nb_rollout_steps=100,
        gamma=0.95,
        observation_range=(-1.5, 1.5),
        random_exploration=0.05,
        normalize_observations=True,
        critic_l2_reg=0.01)

    # Available strategies (cf paper): future, final, episode, random
    model = HER('MlpPolicy',
                env,
                model_class,
                verbose=1,
                n_sampled_goal=4,
                goal_selection_strategy='future',
                buffer_size=int(1e5),
                batch_size=128,
                tensorboard_log="./ddpg_dvrk_tensorboard/",
                **rl_model_kwargs)

    env.reset()
    model.learn(4000000,
                log_interval=100,
                callback=CheckpointCallback(
                    save_freq=100000, save_path="./ddpg_dvrk_tensorboard/"))
    model.save("./her_robot_env")
def main(load_policy=False):
    """Train TD3+HER pushing with dynamics randomization, optionally resuming
    from a fixed-physics checkpoint."""
    global log_dir, log_dir_policy
    if load_policy:
        log_dir_policy = '../policies/PUSHING_TD3+HER_FIXED_POSITION_DYN_RAND_FROM_FIXED_PHYSICS'

    model_class = TD3  # works also with SAC and DDPG

    # Run configuration (several knobs are retained for reference only).
    action_space = 7
    fixed = True
    normalize_observations = False
    gamma = 0.9
    memory_limit = 1000000
    normalize_returns = True
    timesteps = 1500000
    discreteAction = 0
    rend = False

    env = pandaPushGymEnvHERRand(urdfRoot=robot_data.getDataPath(),
                                 renders=rend,
                                 useIK=0,
                                 isDiscrete=discreteAction,
                                 action_space=action_space,
                                 fixedPositionObj=fixed,
                                 includeVelObs=True)
    env = Monitor(env, log_dir, allow_early_resets=True)

    # Available strategies (cf paper): future, final, episode, random
    goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(n_actions),
        sigma=float(0.5) * np.ones(n_actions))

    model = HER(CustomPolicy, env, model_class,
                n_sampled_goal=4,
                goal_selection_strategy=goal_selection_strategy,
                verbose=1,
                tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/TD3+HER_FIXED_DYN_RAND",
                buffer_size=1000000,
                batch_size=256,
                random_exploration=0.3,
                action_noise=action_noise)

    if load_policy:
        # Resume from the fixed-position checkpoint; note the distinct TB log dir.
        model = HER.load(
            "../policies/USEFUL_POLICIES/PUSHING_TD3+HER_FIXED_POSITIONbest_model.pkl",
            env=env,
            n_sampled_goal=4,
            goal_selection_strategy=goal_selection_strategy,
            tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/TD3+HER_FIXED_DYN_RAND_FROM_FIXED_PHYSICS",
            buffer_size=1000000,
            batch_size=256,
            random_exploration=0.3,
            action_noise=action_noise)

    # Train the model starting from a previous policy
    model.learn(timesteps, callback=callback)
    model.save("../policies/PUSHING_FIXED_TD3_DYN_RAND")
    print("Finished train1")
def main(load_policy=False):
    """Train TD3+HER pushing with IK control (phase 1), optionally resuming."""
    global log_dir
    model_class = TD3  # works also with SAC and DDPG

    # Run configuration (several knobs are retained for reference only).
    action_space = 6
    fixed = True
    # 0 completely fixed, 1 slightly random radius, 2 big random radius,
    object_position = 1
    normalize_observations = False
    gamma = 0.9
    memory_limit = 1000000
    normalize_returns = True
    timesteps = 5000000
    discreteAction = 0
    rend = False

    env = pandaPushGymEnvHER(urdfRoot=robot_data.getDataPath(),
                             renders=rend,
                             useIK=1,
                             isDiscrete=discreteAction,
                             action_space=action_space,
                             fixedPositionObj=fixed,
                             includeVelObs=True,
                             object_position=object_position)
    env = Monitor(env, log_dir, allow_early_resets=True)

    goal_selection_strategy = 'future'
    n_actions = env.action_space.shape[-1]
    action_noise = OrnsteinUhlenbeckActionNoise(
        mean=np.zeros(n_actions),
        sigma=float(0.5) * np.ones(n_actions))

    # Keyword arguments shared by fresh construction and checkpoint loading.
    her_kwargs = dict(
        n_sampled_goal=4,
        goal_selection_strategy=goal_selection_strategy,
        tensorboard_log="../pybullet_logs/panda_push_TD3/stable_baselines/PUSHING_TD3+HER_FIXED_POSITION_PHASE_1_IK",
        buffer_size=1000000,
        batch_size=256,
        random_exploration=0.3,
        action_noise=action_noise)

    model = HER(CustomTD3Policy, env, model_class, verbose=1, **her_kwargs)

    if load_policy:
        model = HER.load(
            "../policies/USEFUL_POLICIES/PUSHING_TD3+HER_FIXED_POSITIONbest_model.pkl",
            env=env, **her_kwargs)

    model.learn(timesteps, log_interval=100, callback=callback)
    print("Saving Policy PHASE_1")
    model.save("../policies/PUSHING_TD3+HER_FIXED_POSITION_PHASE_1_IK")
Exemple #17
0
 def __init__(self,
              env: ISettableGoalEnv,
              verbose=1,
              rank=0,
              experiment_name="her-sac"):
     """Create (or reload) a SAC+HER agent for *env*.

     If a best-model checkpoint already exists under the experiment's model
     directory, it is loaded; otherwise a fresh model is constructed.
     """
     self._env = env
     # Per-experiment directory layout (tensorboard/model/best-model paths),
     # keyed by env class name, experiment name and worker rank.
     self._dirs = Dirs(
         experiment_name=f"{type(env).__name__}-{experiment_name}",
         rank=rank)
     # Options shared by both the checkpoint-load and fresh-build paths.
     options = {
         "env": env,
         "tensorboard_log": self._dirs.tensorboard,
         "model_class": SAC,
         "gamma": 1,
         "learning_rate": 3e-3
     }
     if os.path.isdir(self._dirs.models) and os.path.isfile(
             self._dirs.best_model):
         self._model = HER.load(load_path=self._dirs.best_model, **options)
         print(f"Loaded model {self._dirs.best_model}")
     else:
         self._model = HER(policy="MlpPolicy", verbose=verbose, **options)
    # NOTE(review): this appears to be the tail of a training function whose
    # signature (providing model_path, total_time_steps,
    # validate_every_timesteps) is not visible in this excerpt — confirm
    # against the full source.
    os.makedirs(model_path, exist_ok=True)

    set_global_seeds(0)
    num_of_active_envs = 1
    # NOTE(review): `layer` looks like a typo for `layers`, and this dict is
    # never passed to the model below — verify the original intent.
    policy_kwargs = dict(layer=[256, 256])
    #env = gym.make("real_robot_challenge_phase_1-v1")
    env = FlatObservationWrapper(
        ExamplePushingTrainingEnv(frameskip=20, visualization=False))

    # NOTE(review): these PPO-style hyperparameters are unused by the SAC+HER
    # model constructed below — confirm whether they were meant to be forwarded.
    train_configs = {
        "gamma": 0.99,
        "n_steps": int(120000 / 20),
        "ent_coef": 0.01,
        "learning_rate": 0.00025,
        "vf_coef": 0.5,
        "max_grad_norm": 0.5,
        "nminibatches": 40,
        "noptepochs": 4,
    }

    model = HER(MlpPolicy, env, SAC, verbose=1, tensorboard_log=model_path)

    # Checkpoint frequency scaled by the number of active environments.
    ckpt_frequency = int(validate_every_timesteps / num_of_active_envs)
    checkpoint_callback = CheckpointCallback(save_freq=ckpt_frequency,
                                             save_path=model_path,
                                             name_prefix="model")

    model.learn(int(total_time_steps), callback=checkpoint_callback)
    env.close()
def launchAgent():
    """Build the agent selected by `model_name` and run 1000 learn/save cycles
    on the minimap environment (hard-coded to PPO2 here)."""
    import Reinforcement_AI.env.d_image_env as image_env
    from stable_baselines import DQN, HER, DDPG, PPO2
    from stable_baselines.common import make_vec_env

    model_name = "PPO2"

    # BUG FIX: these branches were independent `if`s with a trailing `else`,
    # so "HER" and "DDPG" selections were immediately overwritten by the DQN
    # fallback.  `elif` makes the chain mutually exclusive.
    if model_name == "HER":
        model = HER(
            "CnnPolicy",
            env=image_env.DetailedMiniMapEnv(),
            model_class=DQN
        )
    elif model_name == "DDPG":
        model = DDPG(
            policy="CnnPolicy",
            env=image_env.DDPGImageEnv(),
            normalize_observations=True
        )
    elif model_name == "PPO2":
        env = make_vec_env(image_env.DetailedMiniMapEnv, n_envs=1)
        model = PPO2(
            policy="CnnPolicy",
            env=env,
            verbose=1
        )
    else:
        model = DQN(
            "CnnPolicy",  # policy
            env=image_env.DetailedMiniMapEnv(),  # environment
            double_q=True,  # Double Q enable
            prioritized_replay=True,  # Replay buffer enabled
            verbose=0  # log print
        )

    for i in range(1000):
        if i != 0:
            # Same `elif` fix as above: reload only the matching checkpoint.
            if model_name == "HER":
                model = HER.load("detailedmap_HER_" + str(i))
                model.set_env(image_env.DetailedMiniMapEnv())
            elif model_name == "DDPG":
                model = DDPG.load("detailedmap_DDPG_" + str(i))
                model.set_env(image_env.DDPGImageEnv())
            elif model_name == "PPO2":
                # `env` was created in the PPO2 construction branch above.
                model = PPO2.load("detailedmap_PPO2_" + str(i), env)
            else:
                model = DQN.load("detailedmap_DQN_" + str(i))
                model.set_env(image_env.DetailedMiniMapEnv())

        # Train for a fixed budget, checkpoint, then free the model.
        model.learn(total_timesteps=3900)
        model.save("detailedmap_" + model_name + "_" + str(i+1))
        del model
import time
# NOTE(review): `time` appears unused in this excerpt — confirm against the
# full script before removing.

# Goal-conditioned pick-and-place task from gym's robotics suite.
env = gym.make("FetchPickAndPlace-v1")

goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

# Wrap the model: SAC + HER with a 3x256 network and heavy random exploration.
model = HER('MlpPolicy',
            env,
            SAC,
            n_sampled_goal=4,
            goal_selection_strategy='future',
            verbose=1,
            buffer_size=int(1e6),
            learning_rate=0.001,
            gamma=0.95,
            batch_size=256,
            ent_coef='auto',
            random_exploration=0.3,
            learning_starts=1000,
            train_freq=1,
            policy_kwargs=dict(layers=[256, 256, 256]),
            tensorboard_log="./OpenAI/")
# Train the model
model.learn(int(8e6))

model.save("./model2")

# WARNING: you must pass an env
# or wrap your environment with HERGoalEnvWrapper to use the predict method
model = HER.load('./model2', env=env)
    def train_single(self, env_name="Merging-v0"):
        """
        Train one model per seed directly on `env_name`.

        For each seed in a fixed list, this builds matching train/eval envs
        with the current barrier size and the 'right' homotopy class, builds
        the configured model type (PPO / DQN / HER), and delegates the actual
        training loop to `train`.
        """
        for seed in [201, 202, 203, 204, 205]:
            print(f"\ntraining with bsize {self.bs}, seed{seed}")
            # Per-seed experiment bookkeeping (directory + name).
            self.seed = seed
            self.experiment_name = f"B{self.bs}R{seed}"
            print("EXPT NAME: ", self.experiment_dir1, self.experiment_name)
            self.experiment_dir = os.path.join(self.experiment_dir1,
                                               self.experiment_name)
            self.create_eval_dir()
            self.model = None
            # Train and eval envs share the same barrier/homotopy configuration.
            env = gym.make(env_name)
            eval_env = gym.make(env_name)
            env._set_barrier_size(self.bs)
            env._set_homotopy_class('right')
            eval_env._set_barrier_size(self.bs)
            eval_env._set_homotopy_class('right')
            if self.model_type == "PPO":
                if self.is_save:
                    ### DEEPER NETWORK
                    #policy_kwargs = dict(net_arch=[dict(pi=[64, 64, 64, 64],
                    #                                    vf=[64, 64, 64, 64])]
                    #                                    )
                    #self.PPO = PPO2('MlpPolicy', env, verbose=1, seed=self.seed, learning_rate=1e-3,
                    #                policy_kwargs=policy_kwargs)
                    ### DROPOUT
                    #self.PPO = PPO2(MlpGeneralPolicy1, env, verbose=1, seed=self.seed, learning_rate=1e-3)
                    ### REGULAR
                    self.PPO = PPO2('MlpPolicy',
                                    env,
                                    verbose=1,
                                    seed=self.seed,
                                    learning_rate=1e-3)
                else:
                    self.PPO = PPO2('MlpPolicy',
                                    env,
                                    verbose=1,
                                    seed=self.seed,
                                    learning_rate=1e-3)

                self.model = train(self.PPO, eval_env, self.timesteps,
                                   self.experiment_dir, self.is_save,
                                   self.eval_save_period, self.rets_path, 0)
            elif self.model_type == "DQN":
                # Tensorboard logging is only enabled when saving is requested.
                if self.is_save:
                    self.DQN = DQN(
                        'MlpPolicy',
                        env,
                        verbose=1,
                        seed=self.seed,
                        prioritized_replay=True,
                        learning_rate=1e-3,
                        tensorboard_log="./Gridworldv1_tensorboard/" +
                        self.experiment_name,
                        full_tensorboard_log=True)
                else:
                    self.DQN = DQN('MlpPolicy',
                                   env,
                                   verbose=1,
                                   seed=self.seed,
                                   prioritized_replay=True,
                                   learning_rate=1e-3)
                self.model = train(self.DQN, eval_env, self.timesteps,
                                   self.experiment_dir, self.is_save,
                                   self.eval_save_period, self.rets_path, 0)
            elif self.model_type == "HER":
                # HER requires the goal-env interface on both envs.
                env = HERGoalEnvWrapper(env)
                eval_env = HERGoalEnvWrapper(eval_env)
                print("bs: ", env.env.barrier_size)
                print("hc: ", env.env.homotopy_class)
                self.HER = HER('MlpPolicy',
                               env,
                               DDPG,
                               n_sampled_goal=4,
                               goal_selection_strategy="future",
                               seed=self.seed,
                               verbose=1)
                self.model = train(self.HER, eval_env, self.timesteps,
                                   self.experiment_dir, self.is_save,
                                   self.eval_save_period, self.rets_path, 0)
Exemple #22
0
# Load the multi-agent blueprint-construction environment from local checkouts.
env_name = 'blueprint_construction'
core_dir = '/Users/abhijithneilabraham/Documents/GitHub/multi-agent-emergence-environments/'
envs_dir = 'mae_envs/envs'
xmls_dir = 'xmls'
env, _ = load_env(env_name,
                  core_dir=core_dir,
                  envs_dir=envs_dir,
                  xmls_dir=xmls_dir,
                  return_args_remaining=True)
# Available strategies (cf paper): future, final, episode, random
goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

# Wrap the model
# NOTE(review): `model_class` is not defined in this excerpt — it must come
# from elsewhere (e.g. SAC/DDPG/TD3); verify before running.
model = HER('MlpPolicy',
            env,
            model_class,
            n_sampled_goal=4,
            goal_selection_strategy=goal_selection_strategy,
            verbose=1)
# Train the model
model.learn(1000)

model.save("./her_bit_env")

# WARNING: you must pass an env
# or wrap your environment with HERGoalEnvWrapper to use the predict method
model = HER.load('./her_bit_env', env=env)

# Roll the trained policy out for 100 steps.
obs = env.reset()
for _ in range(100):
    action, _ = model.predict(obs)
    obs, reward, done, _ = env.step(action)
Exemple #23
0
def callback(_locals, _globals):
    """Periodic checkpointing callback for stable-baselines `learn`.

    Saves the model once at step 10000 and then every 50000 steps; always
    returns True so training continues.
    """
    global n_steps

    n_steps += 1
    # Guard clause: skip everything unless this is a checkpoint step.
    if n_steps % 50000 != 0 and n_steps != 10000:
        return True

    print('Saving: ', n_steps)
    save_path = 'checkpoints/yumi/her/her_{}_task_{}_{}.npy'.format(
        name, args.task, n_steps)
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    model.save(save_path)
    return True


# Wrap DDPG with HER for the YuMi task.
# FIX: the original passed **dict(random_exploration=.2); a plain keyword
# argument is equivalent and clearer.
model = HER('MlpPolicy',
            env,
            model_class=DDPG,
            verbose=1,
            tensorboard_log=log_dir,
            random_exploration=0.2)
model.learn(total_timesteps=total_timesteps, callback=callback)
model.save("her-yumi-{}-final".format(n_steps))

# Persist the observation-normalisation running averages so the saved policy
# can later be evaluated with the same statistics.
env.save_running_average(log_dir)

# Quick qualitative rollout of the freshly trained policy.
obs = env.reset()
for i in range(100):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
# !pip install highway-env
# !pip install stable-baselines==2.10.0

# Environment
import gym
import highway_env

# Agent
from stable_baselines import HER, SAC

"""## Training"""

env = gym.make("parking-v0")

# HER + SAC hyper-parameters for the highway-env parking task, gathered in
# one place before construction.
her_kwargs = dict(n_sampled_goal=4,
                  goal_selection_strategy='future',
                  verbose=1,
                  buffer_size=int(1e6),
                  learning_rate=1e-3,
                  gamma=0.9,
                  batch_size=256,
                  policy_kwargs=dict(layers=[256, 256, 256]))
model = HER('MlpPolicy', env, SAC, **her_kwargs)
model.learn(int(5e4))

"""## Visualize a few episodes

We first define a simple helper function for visualization of episodes:
"""

# !pip install gym pyvirtualdisplay
# !apt-get install -y xvfb python-opengl ffmpeg

from IPython import display as ipythondisplay
from pyvirtualdisplay import Display
from gym.wrappers import Monitor
Exemple #25
0
from stable_baselines.common.policies import MlpPolicy
from stable_baselines import HER, SAC

print('setting up environment')
#env = gym.make("REALRobot2020-R2J3-v0")


def _build_env():
    """Build the REALRobot env wrapped for HER.

    Currently the wrapper doesn't really return goals but just
    sample_placeholder to match the her format.
    """
    return GoalWrapper(REALRobotEnv(objects=1), crop_obs=True)


env = _build_env()

print('setting up model')
model = HER('MlpPolicy',
            env,
            SAC,
            n_sampled_goal=4,
            goal_selection_strategy='future',
            verbose=1,
            buffer_size=int(1e6),
            learning_rate=1e-3,
            gamma=0.95,
            batch_size=256)
print('start learning')
model.learn(total_timesteps=256)
print('learning done')

# Rendering doesn't work through the wrappers right now, so rebuild the
# environment from scratch before switching on the human viewer.
env = _build_env()
env.render("human")

print('display model')
    def get_goal(self):
        """Draw a difficulty level at random and sample a goal for it.

        Side effect: the drawn level is recorded in ``self.difficulty``.
        """
        level = np.random.choice(self.difficulties)
        self.difficulty = level
        return move_cube.sample_goal(difficulty=level)


if __name__ == "__main__":

    # Train across all four difficulty levels (1..4); the Initializer draws
    # one at random per episode (see get_goal above in the original file).
    difficulties = np.arange(1, 5)
    env = gym.make(
        'Example_CubeEnv-v0',
        initializer=Initializer(difficulty=difficulties),
        action_type=cube_env.ActionType.POSITION,
        visualization=False,
    )

    # SAC hyper-parameters forwarded through HER to the underlying algorithm.
    model_kwargs = {
        'ent_coef': 'auto',
        'buffer_size': int(1e6),
        'gamma': 0.95,
        'learning_starts': 1000,
        'train_freq': 1
    }

    model = HER('MlpPolicy', env, SAC, verbose=True, **model_kwargs)

    # Long training run (8M steps), then persist the policy.
    model.learn(int(8e6))
    model.save("./hersac_CubeEnv_diffall")

    # Powers off the machine when training finishes — presumably a dedicated
    # training box; remove if running locally.
    os.system("shutdown now")
    'render': render,
    'param_noise': None,
    'action_noise': action_noise,
    'normalize_observations': normalize,
    'nb_train_steps': nb_train_steps,
    'nb_rollout_steps': nb_rollout_steps,
    'batch_size': batch_size,
    'critic_l2_reg': critic_l2_reg,
    'buffer_size': buffer_size,
    'random_exploration': random_exploration,
    'policy_kwargs': {
        'layer_norm': True
    },
    'logging': suff
}
# Build a HER-wrapped DDPG agent; `kwargs` (assembled above this block)
# carries the DDPG hyper-parameters, noise settings and a 'logging' suffix.
model = HER('MlpPolicy', env, DDPG, **kwargs)
start = time.time()

model.learn(total_timesteps=total_timesteps, log_interval=1)

# When logging is enabled, persist the trained policy under pkl/<suffix> and
# report timing; otherwise only report how long training took.
if log:
    model.save("pkl/{}".format(suff))
    print(
        "Saved as {0}, trained {1} primitive policy for {2} timesteps in {3}".
        format(suff, policy, total_timesteps,
               time.strftime('%H:%M:%S', time.gmtime(time.time() - start))))

else:
    print("Trained {0} primitive policy for {1} timesteps in {2}".format(
        policy, total_timesteps,
        time.strftime('%H:%M:%S', time.gmtime(time.time() - start))))
Exemple #28
0
def launchAgent(env_name: int,
                model_name: str,
                test_mode=False,
                filepath=None):
    """Create a kart environment and a model, then train or test-drive it.

    :param env_name: which environment to load:
        1: point-to-point distance environment, without the minimap image.
        2: environment using the minimap image, with updated rewards.
        3, 4: further point-distance environment variants.
        other (default): the currently used setup — minimap image with
        re-updated rewards.
    :param model_name: which model to create:
        "DQN": a DQN model. "HER": a HER model (DQN inside). other: PPO2.
    :param test_mode: when True, the agent is restored from ``filepath`` and
        only driven; no learning takes place (this branch loops forever).
    :param filepath: checkpoint path to restore in test mode.
    :return: the model that ran the last episode (training mode only).
    """

    from stable_baselines import DQN, HER, PPO2

    if env_name == 1:
        from Reinforcement_AI.env.a_env import KartEnv
        kart_env = KartEnv()
        policy = "MlpPolicy"
    elif env_name == 2:
        from Reinforcement_AI.env.d_image_env import DetailedMiniMapEnv as DetailedMiniMapEnv1
        kart_env = DetailedMiniMapEnv1()
        policy = "CnnPolicy"
    elif env_name == 3:
        from Reinforcement_AI.env.a_env2 import KartEnv
        kart_env = KartEnv()
        policy = "MlpPolicy"
    elif env_name == 4:
        from Reinforcement_AI.env.a_env3 import KartEnv
        kart_env = KartEnv()
        policy = "MlpPolicy"
    else:  #env_name == "detailed_minimap_enhanced" or env_name == "4":
        from Reinforcement_AI.env.e_enhanced_image_env import DetailedMiniMapEnv as DetailedMiniMapEnv2
        kart_env = DetailedMiniMapEnv2()
        policy = "CnnPolicy"

    if model_name == "DQN":
        model = DQN(policy=policy,
                    env=kart_env,
                    double_q=True,
                    prioritized_replay=True,
                    verbose=1)
    elif model_name == "HER":
        model = HER(policy=policy, env=kart_env, model_class=DQN, verbose=1)
    else:  # model_name == "PPO2"
        model = PPO2(policy=policy,
                     learning_rate=0.0001,
                     env=kart_env,
                     verbose=1)

    if test_mode:  # Test mode: restore the agent and let it drive.
        if filepath is None:
            raise ValueError("test_mode=True requires a checkpoint filepath")
        # BUG FIX: `.load()` is a classmethod that returns a NEW model; the
        # original code discarded its result, so an untrained model drove the
        # kart. Rebind the loaded model (attaching the env for predict()).
        model = model.load(filepath, env=kart_env)
        kart_env.set_continuos(True)

        while True:
            observation = kart_env.reset()
            while True:
                action, _states = model.predict(observation)
                observation, rewards, dones, info = kart_env.step(action)
                if dones:
                    break

    else:
        for i in range(1000):
            model.learn(total_timesteps=12500)
            model.save(str(env_name) + "_" + model_name + "_" + str(i + 1))
        # Return the trained model, as documented (the original silently
        # returned None despite its docstring).
        return model
Exemple #29
0
import gym
import time

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2, DQN, HER, DDPG

import synergyenvs

env = gym.make("GraspBoxPybullet-v0")
env.render()
o = env.reset()

# model = PPO2(MlpPolicy, env, verbose=1)
# BUG FIX: `load` is a classmethod that RETURNS the restored model. The
# original code built a fresh HER model and called `model.load(...)` while
# discarding the result, so the rollout below ran with untrained weights.
# Load directly and bind the result (passing env so predict() works).
model = HER.load("./her_graspbox-1", env=env)

env.camera_adjust()

# Roll the restored policy out, resetting whenever an episode ends.
for _ in range(10000):
    env.render()
    action, _states = model.predict(o)
    # action = env.action_space.sample()
    o, r, done, info = env.step(action)
    print(o, r, done, info)
    if done:
        o = env.reset()
    time.sleep(0.1)

env.close()
Exemple #30
0
def train_decision(config=None,
                   save=False,
                   load=False,
                   calender=None,
                   history=None,
                   predict_results_dict=None,
                   test_mode=False,
                   start_date=None,
                   stop_date=None,
                   episode_steps=1000,
                   model='DDPG'):
    """
    Train the decision model: read data from the database and run decision
    training with the selected RL algorithm.

    Parameters:
        config: configuration object,
        save: whether to save results/history,
        load: resume from a previously saved model if one exists,
        calender: trading-day calendar,
        history: market quotation history,
        predict_results_dict: per-key prediction results
        model: which agent to train — 'DDPG', 'TD3' or 'HER'
    """
    # First, convert the string dates in the prediction data to date objects.

    MODEL = model

    predict_dict = {}
    for k, v in predict_results_dict.items():
        assert isinstance(v['predict_date'].iloc[0], str)
        tmp = v['predict_date'].apply(
            lambda x: arrow.get(x, 'YYYY-MM-DD').date())
        predict_dict[k] = v.rename(index=tmp)

    env = Portfolio_Prediction_Env(config=config,
                                   calender=calender,
                                   stock_history=history,
                                   window_len=1,
                                   prediction_history=predict_dict,
                                   start_trade_date=start_date,
                                   stop_trade_date=stop_date,
                                   save=save)

    # Test mode: drive the env with random weights/offers, no learning.
    # NOTE(review): there is no early return here, so after this loop the
    # function still falls through to the training branches below — confirm
    # that is intended.
    if test_mode:
        obs = env.reset()
        # check_env(env)
        for i in range(1000):
            # Random portfolio weights W and price offers for 6 assets.
            W = np.random.uniform(0.0, 1.0, size=(6, ))
            offer = np.random.uniform(-10.0, 10.0, size=(6, ))
            obs, reward, done, infos = env.step(np.hstack((W, offer)))
            # env.render()
            if done:
                env.save_history()
                break
        env.close()

    # Training mode.
    if MODEL == "DDPG":
        # Add exploration noise.
        # NOTE(review): this branch uses env.action_space.shape (a tuple)
        # while the TD3 branch uses shape[-1] (an int); np.zeros accepts
        # both, but the inconsistency is worth confirming.
        n_actions = env.action_space.shape
        param_noise = None
        # Ornstein-Uhlenbeck noise, well suited to inertial-system control.
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                    sigma=float(0.5) *
                                                    np.ones(n_actions))

        model_path = search_file(
            os.path.join(sys.path[0], 'saved_models', MODEL), MODEL)
        if len(model_path) > 0 and load:
            model = DDPG.load(
                model_path[0],
                env=env,
                policy=CustomDDPGPolicy,
                param_noise=param_noise,
                action_noise=action_noise,
                # tensorboard_log='./tb_log',
            )
        else:
            model = DDPG(
                policy=CustomDDPGPolicy,
                env=env,
                verbose=1,
                param_noise=param_noise,
                action_noise=action_noise,
                # tensorboard_log='./tb_log',
            )
        # Number of training steps.
        model.learn(total_timesteps=episode_steps, )
        model.save(
            os.path.join(sys.path[0], 'saved_models', MODEL, MODEL + '.h5'))

    elif MODEL == 'TD3':
        n_actions = env.action_space.shape[-1]
        # Ornstein-Uhlenbeck noise, well suited to inertial-system control.
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                    sigma=float(0.5) *
                                                    np.ones(n_actions))

        model_path = search_file(
            os.path.join(sys.path[0], 'saved_models', MODEL), MODEL)
        if len(model_path) > 0 and load:
            model = TD3.load(
                model_path[0],
                env=env,
                policy=CustomTD3Policy,
                action_noise=action_noise,
                # tensorboard_log='./tb_log',
            )
        else:
            model = TD3(
                policy=CustomTD3Policy,
                env=env,
                verbose=1,
                action_noise=action_noise,
                # tensorboard_log='./tb_log',
            )
        # Number of training steps.
        model.learn(total_timesteps=episode_steps, )
        model.save(
            os.path.join(sys.path[0], 'saved_models', MODEL, MODEL + '.h5'))

    elif MODEL == "HER":
        # The env must be a gym GoalEnv for HER to apply.
        """
        env必须是GoalEnv
        """
        model_class = DDPG

        # Available strategies (cf paper): future, final, episode, random
        goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

        # Wrap the model
        model = HER(policy=CustomDDPGPolicy,
                    env=env,
                    model_class=model_class,
                    n_sampled_goal=4,
                    goal_selection_strategy=goal_selection_strategy,
                    verbose=1)
        model.learn(total_timesteps=episode_steps, )
        model.save(
            os.path.join(sys.path[0], 'saved_models', MODEL, MODEL + '.h5'))

    obs = env.reset()
    # Live-evaluation mode: roll the trained policy through the env once.
    # NOTE(review): if MODEL is none of DDPG/TD3/HER, `model` is still the
    # string argument here and predict() would fail — confirm callers only
    # pass the three supported names.
    for i in range(1000):
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        # env.render(info=info)
        if done:
            if save:
                env.save_history()
            env.reset()
            break

    env.close()