Example #1
    def actor_forward(self, obs, deterministic=False):
        latent_pi, _ = self._get_latent(obs)
        _, action, _ = self._get_action_dist_from_latent(latent_pi, deterministic=deterministic)
        return tf.stop_gradient(action).numpy()

    @tf.function
    def evaluate_actions(self, obs, action, deterministic=False):
        """
        Evaluate actions according to the current policy,
        given the observations.

        :param obs: (tf.Tensor)
        :param action: (tf.Tensor)
        :param deterministic: (bool)
        :return: (tf.Tensor, tf.Tensor, tf.Tensor) estimated value, log likelihood of taking those actions
            and entropy of the action distribution.
        """
        latent_pi, latent_vf = self._get_latent(obs)
        _, _, action_distribution = self._get_action_dist_from_latent(latent_pi, deterministic=deterministic)
        log_prob = action_distribution.log_prob(action)
        value = self.value_net(latent_vf)
        return value, log_prob, action_distribution.entropy()

    def value_forward(self, obs):
        _, latent_vf, _ = self._get_latent(obs)
        return self.value_net(latent_vf)

MlpPolicy = PPOPolicy

register_policy("MlpPolicy", MlpPolicy)
Example #2
def main(args):
    log_dir = args.log_path if (
        args.log_path is not None
    ) else "/tmp/stable_baselines_" + time.strftime('%Y-%m-%d-%H-%M-%S')
    if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
        rank = 0
        configure_logger(log_dir)
    else:
        rank = MPI.COMM_WORLD.Get_rank()
        configure_logger(log_dir, format_strs=[])

    set_global_seeds(args.seed)

    model_class = SAC_SIR  # works also with SAC, DDPG and TD3

    env_kwargs = get_env_kwargs(args.env,
                                random_ratio=args.random_ratio,
                                sequential=args.sequential,
                                reward_type=args.reward_type,
                                n_object=args.n_object)

    def make_thunk(rank):
        return lambda: make_env(
            env_id=args.env, rank=rank, log_dir=log_dir, kwargs=env_kwargs)

    env = ParallelSubprocVecEnv(
        [make_thunk(i) for i in range(args.num_workers)], reset_when_done=True)

    def make_thunk_aug(rank):
        return lambda: FlattenDictWrapper(
            make_env(env_id=aug_env_name, rank=rank, kwargs=aug_env_kwargs),
            ['observation', 'achieved_goal', 'desired_goal'])

    aug_env_kwargs = env_kwargs.copy()
    del aug_env_kwargs['max_episode_steps']
    aug_env_name = args.env.split('-')[0] + 'Unlimit-' + args.env.split('-')[1]
    aug_env = ParallelSubprocVecEnv(
        [make_thunk_aug(i) for i in range(args.num_workers)],
        reset_when_done=False)

    if os.path.exists(os.path.join(logger.get_dir(), 'eval.csv')):
        os.remove(os.path.join(logger.get_dir(), 'eval.csv'))
        print('Remove existing eval.csv')
    eval_env_kwargs = env_kwargs.copy()
    eval_env_kwargs['random_ratio'] = 0.0
    eval_env = make_env(env_id=args.env, rank=0, kwargs=eval_env_kwargs)
    eval_env = FlattenDictWrapper(
        eval_env, ['observation', 'achieved_goal', 'desired_goal'])

    if not args.play:
        os.makedirs(log_dir, exist_ok=True)

    # Available strategies (cf paper): future, final, episode, random
    goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE
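    # With 'future', transitions are replayed with goals that were actually achieved
    # later in the same episode, the strategy recommended in the HER paper.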

    if not args.play:
        from stable_baselines.ddpg.noise import NormalActionNoise
        noise_type = args.action_noise.split('_')[0]
        if noise_type == 'none':
            parsed_action_noise = None
        elif noise_type == 'normal':
            sigma = float(args.action_noise.split('_')[1])
            parsed_action_noise = NormalActionNoise(
                mean=np.zeros(env.action_space.shape),
                sigma=sigma * np.ones(env.action_space.shape))
        else:
            raise NotImplementedError

        train_kwargs = get_train_kwargs("sac_sir", args, parsed_action_noise,
                                        eval_env, aug_env)

        def callback(_locals, _globals):
            if _locals['step'] % int(1e3) == 0:
                if 'FetchStack' in args.env:
                    mean_eval_reward = stack_eval_model(
                        eval_env,
                        _locals["self"],
                        init_on_table=(args.env == 'FetchStack-v2'))
                elif 'MasspointPushDoubleObstacle-v2' in args.env:
                    mean_eval_reward = egonav_eval_model(
                        eval_env,
                        _locals["self"],
                        env_kwargs["random_ratio"],
                        fixed_goal=np.array([4., 4., 0.15, 0., 0., 0., 1.]))
                    mean_eval_reward2 = egonav_eval_model(
                        eval_env,
                        _locals["self"],
                        env_kwargs["random_ratio"],
                        goal_idx=0,
                        fixed_goal=np.array([4., 4., 0.15, 1., 0., 0., 0.]))
                    log_eval(_locals['self'].num_timesteps,
                             mean_eval_reward2,
                             file_name="eval_box.csv")
                else:
                    mean_eval_reward = eval_model(eval_env, _locals["self"])
                log_eval(_locals['self'].num_timesteps, mean_eval_reward)
            if _locals['step'] % int(2e4) == 0:
                model_path = os.path.join(
                    log_dir, 'model_' + str(_locals['step'] // int(2e4)))
                model.save(model_path)
                print('model saved to', model_path)
            return True

        class CustomSACPolicy(SACPolicy):
            def __init__(self, *model_args, **model_kwargs):
                super(CustomSACPolicy, self).__init__(
                    *model_args,
                    **model_kwargs,
                    layers=[256, 256] if 'MasspointPushDoubleObstacle'
                    in args.env else [256, 256, 256, 256],
                    feature_extraction="mlp")

        register_policy('CustomSACPolicy', CustomSACPolicy)
        from utils.sac_attention_policy import AttentionPolicy
        register_policy('AttentionPolicy', AttentionPolicy)
        policy_kwargs = get_policy_kwargs("sac_sir", args)

        if rank == 0:
            print('train_kwargs', train_kwargs)
            print('policy_kwargs', policy_kwargs)
        # Wrap the model
        model = HER2(args.policy,
                     env,
                     model_class,
                     n_sampled_goal=4,
                     start_augment_time=args.start_augment,
                     goal_selection_strategy=goal_selection_strategy,
                     num_workers=args.num_workers,
                     policy_kwargs=policy_kwargs,
                     verbose=1,
                     **train_kwargs)
        print(model.get_parameter_list())

        # Train the model
        model.learn(
            int(args.num_timesteps),
            seed=args.seed,
            callback=callback,
            log_interval=100 if not ('MasspointMaze-v3' in args.env) else 10)

        if rank == 0:
            model.save(os.path.join(log_dir, 'final'))
Example #3
def main(args):
    log_dir = args.log_path if (args.log_path is not None) else \
        "/tmp/stable_baselines_" + time.strftime('%Y-%m-%d-%H-%M-%S')
    configure_logger(log_dir)

    set_global_seeds(args.seed)

    n_cpu = get_num_workers(args.env) if not args.play else 1
    env_kwargs = get_env_kwargs(args.env, args.random_ratio, args.sequential,
                                args.reward_type, args.n_object,
                                args.curriculum)

    def make_thunk(rank):
        return lambda: make_env(env_id=args.env,
                                rank=rank,
                                log_dir=log_dir,
                                flatten_dict=True,
                                kwargs=env_kwargs)

    env = SubprocVecEnv([make_thunk(i) for i in range(n_cpu)])

    eval_env_kwargs = env_kwargs.copy()
    eval_env_kwargs['random_ratio'] = 0.0
    if "use_cu" in eval_env_kwargs:
        eval_env_kwargs['use_cu'] = False
    eval_env = make_env(env_id=args.env,
                        rank=0,
                        flatten_dict=True,
                        kwargs=eval_env_kwargs)
    print(eval_env)
    if not args.play:
        os.makedirs(log_dir, exist_ok=True)
        train_kwargs = get_train_kwargs("ppo",
                                        args,
                                        parsed_action_noise=None,
                                        eval_env=eval_env)

        # policy = 'MlpPolicy'
        from utils.attention_policy import AttentionPolicy
        register_policy('AttentionPolicy', AttentionPolicy)
        policy_kwargs = get_policy_kwargs("ppo", args)
        print(policy_kwargs)

        model = PPO2(args.policy,
                     env,
                     verbose=1,
                     nminibatches=32,
                     lam=0.95,
                     noptepochs=10,
                     ent_coef=0.01,
                     learning_rate=3e-4,
                     cliprange=0.2,
                     policy_kwargs=policy_kwargs,
                     **train_kwargs)
        print(model.get_parameter_list())

        def callback(_locals, _globals):
            num_update = _locals["update"]
            if 'FetchStack' in args.env:
                mean_eval_reward = stack_eval_model(eval_env, _locals["self"])
            else:
                mean_eval_reward = eval_model(eval_env, _locals["self"])
            log_eval(num_update, mean_eval_reward)
            if num_update % 10 == 0:
                model_path = os.path.join(log_dir,
                                          'model_' + str(num_update // 10))
                model.save(model_path)
                print('model saved to', model_path)
            return True

        model.learn(total_timesteps=int(args.num_timesteps),
                    callback=callback,
                    seed=args.seed,
                    log_interval=1)
        model.save(os.path.join(log_dir, 'final'))

    else:
        assert args.load_path is not None
        model = PPO2.load(args.load_path)
        fig, ax = plt.subplots(1, 1, figsize=(8, 8))
        obs = env.reset()
        goal_dim = env.get_attr('goal')[0].shape[0]
        if 'FetchStack' in args.env:
            while env.get_attr('current_nobject')[0] != env.get_attr('n_object')[0] or \
                    env.get_attr('task_mode')[0] != 1:
                obs = env.reset()
        elif 'FetchPush' in args.env:
            while not (1.25 < obs[0][6] < 1.33 and obs[0][7] < 0.61
                       and 0.7 < obs[0][4] < 0.8):
                obs = env.reset()
            env.env_method('set_goal', np.array([1.2, 0.75, 0.425, 1, 0]))
            obs = env.env_method('get_obs')
            obs[0] = np.concatenate([
                obs[0][key]
                for key in ['observation', 'achieved_goal', 'desired_goal']
            ])
        else:
            while np.argmax(obs[0][-goal_dim + 3:]) != 0:
                obs = env.reset()
        print('achieved_goal', obs[0][-2 * goal_dim:-goal_dim], 'goal',
              obs[0][-goal_dim:])
        episode_reward = 0.0
        num_episode = 0
        frame_idx = 0
        images = []
        if 'max_episode_steps' not in env_kwargs.keys():
            env_kwargs['max_episode_steps'] = 100
        for i in range(env_kwargs['max_episode_steps'] * 10):
            img = env.render(mode='rgb_array')
            ax.cla()
            ax.imshow(img)
            if env.get_attr('goal')[0].shape[0] <= 3:
                ax.set_title('episode ' + str(num_episode) + ', frame ' +
                             str(frame_idx))
            else:
                ax.set_title('episode ' + str(num_episode) + ', frame ' +
                             str(frame_idx) + ', goal idx ' +
                             str(np.argmax(env.get_attr('goal')[0][3:])))
                if 'FetchStack' in args.env:
                    tasks = ['pick and place', 'stack']
                    ax.set_title('episode ' + str(num_episode) + ', frame ' +
                                 str(frame_idx) + ', task: ' +
                                 tasks[np.argmax(obs[0][-2 * goal_dim - 2:-2 *
                                                        goal_dim])])
            images.append(img)
            action, _ = model.predict(obs)
            obs, reward, done, _ = env.step(action)
            episode_reward += reward
            frame_idx += 1
            if not args.export_video:
                plt.pause(0.1)
            else:
                plt.imsave(
                    os.path.join(os.path.dirname(args.load_path),
                                 'tempimg%d.png' % i), img)
            if done:
                print('episode_reward', episode_reward)
                if 'FetchStack' in args.env:
                    while env.get_attr('current_nobject')[0] != env.get_attr('n_object')[0] or \
                            env.get_attr('task_mode')[0] != 1:
                        obs = env.reset()
                else:
                    while np.argmax(obs[0][-goal_dim + 3:]) != 0:
                        obs = env.reset()
                print('goal', obs[0][-goal_dim:])
                episode_reward = 0.0
                frame_idx = 0
                num_episode += 1
                if num_episode >= 10:
                    break
        if args.export_video:
            os.system('ffmpeg -r 5 -start_number 0 -i ' +
                      os.path.dirname(args.load_path) +
                      '/tempimg%d.png -c:v libx264 -pix_fmt yuv420p ' +
                      os.path.join(os.path.dirname(args.load_path), args.env +
                                   '.mp4'))
            for i in range(env_kwargs['max_episode_steps'] * 10):
                try:
                    os.remove(
                        os.path.join(os.path.dirname(args.load_path),
                                     'tempimg' + str(i) + '.png'))
                except OSError:
                    pass
Example #4
class CustomMlpPolicy(BasePolicy):
    def __init__(self, *args, **kwargs):
        super(CustomMlpPolicy, self).__init__(*args, **kwargs,
                                              layers=[16],
                                              feature_extraction="mlp")


class CustomSACPolicy(SACPolicy):
    def __init__(self, *args, **kwargs):
        super(CustomSACPolicy, self).__init__(*args, **kwargs,
                                              layers=[256, 256],
                                              feature_extraction="mlp")


register_policy('CustomSACPolicy', CustomSACPolicy)
register_policy('CustomDQNPolicy', CustomDQNPolicy)
register_policy('CustomMlpPolicy', CustomMlpPolicy)


def flatten_dict_observations(env):
    assert isinstance(env.observation_space, gym.spaces.Dict)
    keys = env.observation_space.spaces.keys()
    return gym.wrappers.FlattenDictWrapper(env, dict_keys=list(keys))


def get_wrapper_class(hyperparams):
    """
    Get a Gym environment wrapper class specified as a hyper parameter
    "env_wrapper".
    e.g.
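The example is cut off inside this docstring. Purely as a hedged sketch of what such a helper could look like (the function name and the 'utils.wrappers.TimeFeatureWrapper' value are illustrative assumptions, not taken from the example):

import importlib

def get_wrapper_class_sketch(hyperparams):
    # Resolve an 'env_wrapper' entry such as 'utils.wrappers.TimeFeatureWrapper'
    # into the wrapper class itself; return None if no wrapper is requested.
    wrapper_name = hyperparams.get('env_wrapper')
    if wrapper_name is None:
        return None
    module_name, class_name = wrapper_name.rsplit('.', 1)
    return getattr(importlib.import_module(module_name), class_name)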
Example #5

if __name__ == "__main__":

    # argparse to define the mode: train or predict
    args = parser.parse_args()
    mode = args.mode
    ####

    if mode == 'train':

        env = GymACRoom()
        
        learning_rate = [0.0001]
        # Register the policy, it will check that the name is not already taken
        register_policy('CustomPolicy', CustomPolicy)
        
        for lr in learning_rate:
            model = PPO2(policy='CustomPolicy', env=env, verbose=1, learning_rate=lr,
                         n_steps=1280, tensorboard_log="./AC_tensorboard/")
            model.learn(total_timesteps=1000000)
            # model.save("AC_PPO2_LR_exp" + str(lr) + ".zip")
            model.save("AC_PPO2_exp_neg_noOffset.zip")
        
        '''
        # scheduling learning rate
        #lr= 0.1
        
        model = PPO2('MlpPolicy', env, verbose=1, learning_rate=0.1, tensorboard_log="./AC_tensorboard/")
        model.learn(total_timesteps = 100000)
        model.learning_rate = 0.01
        model.learn(total_timesteps = 100000)
Example #6
def main(args):
    log_dir = args.log_path if (args.log_path is not None) else \
        "/tmp/stable_baselines_" + time.strftime('%Y-%m-%d-%H-%M-%S')
    configure_logger(log_dir)

    set_global_seeds(args.seed)

    n_cpu = get_num_workers(args.env) if not args.play else 1

    env_kwargs = get_env_kwargs(args.env, args.random_ratio, args.sequential,
                                args.reward_type, args.n_object,
                                args.curriculum)

    def make_thunk(rank):
        return lambda: make_env(env_id=args.env,
                                rank=rank,
                                log_dir=log_dir,
                                flatten_dict=True,
                                kwargs=env_kwargs)

    env = SubprocVecEnv([make_thunk(i) for i in range(n_cpu)])

    aug_env_name = args.env.split('-')[0] + 'Unlimit-' + args.env.split('-')[1]
    aug_env_kwargs = env_kwargs.copy()
    aug_env_kwargs['max_episode_steps'] = None

    def make_thunk_aug(rank):
        return lambda: make_env(env_id=aug_env_name,
                                rank=rank,
                                flatten_dict=True,
                                kwargs=aug_env_kwargs)

    if not args.parallel:
        aug_env = make_env(env_id=aug_env_name,
                           rank=0,
                           flatten_dict=True,
                           kwargs=aug_env_kwargs)
    else:
        aug_env = ParallelSubprocVecEnv(
            [make_thunk_aug(i) for i in range(min(32, n_cpu))],
            reset_when_done=False)
    print(aug_env)

    if os.path.exists(os.path.join(logger.get_dir(), 'eval.csv')):
        os.remove(os.path.join(logger.get_dir(), 'eval.csv'))
        print('Remove existing eval.csv')
    eval_env_kwargs = env_kwargs.copy()
    eval_env_kwargs['random_ratio'] = 0.0
    if "use_cu" in eval_env_kwargs:
        eval_env_kwargs['use_cu'] = False
    eval_env = make_env(env_id=args.env,
                        rank=0,
                        flatten_dict=True,
                        kwargs=eval_env_kwargs)
    print(eval_env)

    if not args.play:
        os.makedirs(log_dir, exist_ok=True)

        from utils.attention_policy import AttentionPolicy
        register_policy('AttentionPolicy', AttentionPolicy)

        policy_kwargs = get_policy_kwargs("ppo_sir", args)

        train_kwargs = get_train_kwargs("ppo_sir",
                                        args,
                                        parsed_action_noise=None,
                                        eval_env=eval_env,
                                        aug_env=aug_env)

        model = PPO2_SIR(args.policy,
                         env,
                         verbose=1,
                         nminibatches=32,
                         lam=0.95,
                         gamma=0.99,
                         noptepochs=10,
                         ent_coef=0.01,
                         learning_rate=3e-4,
                         cliprange=0.2,
                         policy_kwargs=policy_kwargs,
                         horizon=env_kwargs['max_episode_steps'],
                         **train_kwargs)

        def callback(_locals, _globals):
            num_update = _locals["update"]
            if 'FetchStack' in args.env:
                mean_eval_reward = stack_eval_model(eval_env, _locals["self"])
            else:
                mean_eval_reward = eval_model(eval_env, _locals["self"])
            log_eval(num_update, mean_eval_reward)
            if num_update % 10 == 0:
                model_path = os.path.join(log_dir,
                                          'model_' + str(num_update // 10))
                model.save(model_path)
                print('model saved to', model_path)
            return True

        model.learn(total_timesteps=int(args.num_timesteps),
                    callback=callback,
                    seed=args.seed,
                    log_interval=1)
        model.save(os.path.join(log_dir, 'final'))
Example #7
#         with tf.variable_scope(scope, reuse=reuse):

#             qf_h = tf.layers.flatten(obs)
#             for i, layer_size in enumerate(self.layers):
#                 qf_h = tf.layers.dense(qf_h, layer_size, name='fc' + str(i))
#                 if self.layer_norm:
#                     qf_h = tf.contrib.layers.layer_norm(qf_h, center=True, scale=True)
#                 qf_h = self.activ(qf_h)
#                 if i == 0:
#                     qf_h = tf.concat([qf_h, action], axis=-1)

#             # the name attribute is used in pop-art normalization
#             qvalue_fn = tf.layers.dense(qf_h, 1, name='qf_output',
#                                         kernel_initializer=tf.zeros_initializer)#random_uniform_initializer(minval=-3e-3,

#             self.qvalue_fn = qvalue_fn
#             self._qvalue = qvalue_fn[:, 0]
#         return self.qvalue_fn

#     def step(self, obs, state=None, mask=None):
#         return self.sess.run(self.policy, {self.obs_ph: obs})

#     def proba_step(self, obs, state=None, mask=None):
#         return self.sess.run(self.policy, {self.obs_ph: obs})

#     def value(self, obs, action, state=None, mask=None):
#         return self.sess.run(self._qvalue, {self.obs_ph: obs, self.action_ph: action})

register_policy("SoftPolicy", SoftPolicy)
    :param n_batch: (int) The number of batch to run (n_envs * n_steps)
    :param reuse: (bool) If the policy is reusable or not
    :param _kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction
    """
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 n_env=1,
                 n_steps=1,
                 n_batch=None,
                 reuse=False,
                 **_kwargs):
        super(LnMlpPolicy, self).__init__(sess,
                                          ob_space,
                                          ac_space,
                                          n_env,
                                          n_steps,
                                          n_batch,
                                          reuse,
                                          feature_extraction="mlp",
                                          layer_norm=True,
                                          **_kwargs)


register_policy("CnnPolicy", CnnPolicy)
register_policy("LnCnnPolicy", LnCnnPolicy)
register_policy("MlpPolicy", MlpPolicy)
register_policy("LnMlpPolicy", LnMlpPolicy)
register_policy("CustomSACPolicy", CustomSACPolicy)
Example #9
                                          **kwargs,
                                          net_arch=params.net_arch,
                                          act_fun=params.act_fun,
                                          PF_linear_also=True)


class MLP_Policy(FeedForwardPolicy):
    def __init__(self, *args, **kwargs):
        super(MLP_Policy, self).__init__(*args,
                                         **kwargs,
                                         net_arch=params.net_arch,
                                         act_fun=params.act_fun,
                                         feature_extraction="mlp")


register_policy('SCN', SCN)
register_policy('SCN_PF_NOnly', SCN_PF_NOnly)
register_policy('SCN_PF_Both', SCN_PF_Both)
register_policy('MLP_Policy', MLP_Policy)


class CustomDQNPolicy(FeedForwardPolicy):
    def __init__(self, *args, **kwargs):
        super(CustomDQNPolicy, self).__init__(*args,
                                              **kwargs,
                                              layers=[64],
                                              layer_norm=True,
                                              feature_extraction="mlp")


class CustomMlpPolicy(BasePolicy):
Example #10
def main(args):
    log_dir = args.log_path if (
        args.log_path is not None
    ) else "/tmp/stable_baselines_" + time.strftime('%Y-%m-%d-%H-%M-%S')
    if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
        rank = 0
        configure_logger(log_dir)
    else:
        rank = MPI.COMM_WORLD.Get_rank()
        configure_logger(log_dir, format_strs=[])

    set_global_seeds(args.seed)

    model_class = SAC_parallel

    n_workers = args.num_workers if not args.play else 1
    env_kwargs = get_env_kwargs(args.env,
                                random_ratio=args.random_ratio,
                                sequential=args.sequential,
                                reward_type=args.reward_type,
                                n_object=args.n_object)

    def make_thunk(rank):
        return lambda: make_env(
            env_id=args.env, rank=rank, log_dir=log_dir, kwargs=env_kwargs)

    env = ParallelSubprocVecEnv([make_thunk(i) for i in range(n_workers)],
                                reset_when_done=True)

    if os.path.exists(os.path.join(logger.get_dir(), 'eval.csv')):
        os.remove(os.path.join(logger.get_dir(), 'eval.csv'))
        print('Remove existing eval.csv')
    eval_env_kwargs = env_kwargs.copy()
    eval_env_kwargs['random_ratio'] = 0.0
    eval_env = make_env(env_id=args.env, rank=0, kwargs=eval_env_kwargs)
    eval_env = FlattenDictWrapper(
        eval_env, ['observation', 'achieved_goal', 'desired_goal'])

    if not args.play:
        os.makedirs(log_dir, exist_ok=True)

    # Available strategies (cf paper): future, final, episode, random
    goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

    if not args.play:
        from stable_baselines.ddpg.noise import NormalActionNoise
        noise_type = args.action_noise.split('_')[0]
        if noise_type == 'none':
            parsed_action_noise = None
        elif noise_type == 'normal':
            sigma = float(args.action_noise.split('_')[1])
            parsed_action_noise = NormalActionNoise(
                mean=np.zeros(env.action_space.shape),
                sigma=sigma * np.ones(env.action_space.shape))
        else:
            raise NotImplementedError
        train_kwargs = get_train_kwargs("sac", args, parsed_action_noise,
                                        eval_env)

        def callback(_locals, _globals):
            if _locals['step'] % int(1e3) == 0:
                if 'FetchStack' in args.env:
                    mean_eval_reward = stack_eval_model(
                        eval_env,
                        _locals["self"],
                        init_on_table=(args.env == 'FetchStack-v2'))
                elif 'MasspointPushDoubleObstacle-v2' in args.env:
                    mean_eval_reward = egonav_eval_model(
                        eval_env,
                        _locals["self"],
                        env_kwargs["random_ratio"],
                        fixed_goal=np.array([4., 4., 0.15, 0., 0., 0., 1.]))
                    mean_eval_reward2 = egonav_eval_model(
                        eval_env,
                        _locals["self"],
                        env_kwargs["random_ratio"],
                        goal_idx=0,
                        fixed_goal=np.array([4., 4., 0.15, 1., 0., 0., 0.]))
                    log_eval(_locals['self'].num_timesteps,
                             mean_eval_reward2,
                             file_name="eval_box.csv")
                else:
                    mean_eval_reward = eval_model(eval_env, _locals["self"])
                log_eval(_locals['self'].num_timesteps, mean_eval_reward)
            if _locals['step'] % int(2e4) == 0:
                model_path = os.path.join(
                    log_dir, 'model_' + str(_locals['step'] // int(2e4)))
                model.save(model_path)
                print('model saved to', model_path)
            return True

        class CustomSACPolicy(SACPolicy):
            def __init__(self, *model_args, **model_kwargs):
                super(CustomSACPolicy, self).__init__(
                    *model_args,
                    **model_kwargs,
                    layers=[256, 256] if 'MasspointPushDoubleObstacle'
                    in args.env else [256, 256, 256, 256],
                    feature_extraction="mlp")

        register_policy('CustomSACPolicy', CustomSACPolicy)
        from utils.sac_attention_policy import AttentionPolicy
        register_policy('AttentionPolicy', AttentionPolicy)
        policy_kwargs = get_policy_kwargs("sac", args)

        if rank == 0:
            print('train_kwargs', train_kwargs)
            print('policy_kwargs', policy_kwargs)
        # Wrap the model
        model = HER2(args.policy,
                     env,
                     model_class,
                     n_sampled_goal=4,
                     goal_selection_strategy=goal_selection_strategy,
                     num_workers=args.num_workers,
                     policy_kwargs=policy_kwargs,
                     verbose=1,
                     **train_kwargs)
        print(model.get_parameter_list())

        # Train the model
        model.learn(
            int(args.num_timesteps),
            seed=args.seed,
            callback=callback,
            log_interval=100 if not ('MasspointMaze-v3' in args.env) else 10)

        if rank == 0:
            model.save(os.path.join(log_dir, 'final'))

    # WARNING: you must pass an env
    # or wrap your environment with HERGoalEnvWrapper to use the predict method
    if args.play and rank == 0:
        assert args.load_path is not None
        model = HER2.load(args.load_path, env=env)

        fig, ax = plt.subplots(1, 1, figsize=(8, 8))
        obs = env.reset()
        if 'FetchStack' in args.env:
            env.env_method('set_task_array',
                           [[(env.get_attr('n_object')[0], 0)]])
            obs = env.reset()
            while env.get_attr('current_nobject')[0] != env.get_attr(
                    'n_object')[0] or env.get_attr('task_mode')[0] != 1:
                obs = env.reset()
        elif 'FetchPushWallObstacle' in args.env:
            while not (obs['observation'][0][4] > 0.7
                       and obs['observation'][0][4] < 0.8):
                obs = env.reset()
            env.env_method('set_goal', [np.array([1.18, 0.8, 0.425, 1, 0])])
            obs = env.env_method('get_obs')
            obs = {
                'observation': obs[0]['observation'][None],
                'achieved_goal': obs[0]['achieved_goal'][None],
                'desired_goal': obs[0]['desired_goal'][None]
            }
            # obs[0] = np.concatenate([obs[0][key] for key in ['observation', 'achieved_goal', 'desired_goal']])
        elif 'MasspointPushDoubleObstacle' in args.env or 'FetchPushWallObstacle' in args.env:
            while np.argmax(obs['desired_goal'][0][3:]) != 0:
                obs = env.reset()
        elif 'MasspointMaze-v2' in args.env:
            while obs['observation'][0][0] < 3 or obs['observation'][0][1] < 3:
                obs = env.reset()
            env.env_method('set_goal', [np.array([1., 1., 0.15])])
            obs = env.env_method('get_obs')
            obs = {
                'observation': obs[0]['observation'][None],
                'achieved_goal': obs[0]['achieved_goal'][None],
                'desired_goal': obs[0]['desired_goal'][None]
            }

        print('goal', obs['desired_goal'][0], 'obs', obs['observation'][0])
        episode_reward = 0.0
        images = []
        frame_idx = 0
        num_episode = 0
        for i in range(env_kwargs['max_episode_steps'] * 10):
            img = env.render(mode='rgb_array')
            ax.cla()
            ax.imshow(img)
            tasks = ['pick and place', 'stack']
            ax.set_title('episode ' + str(num_episode) + ', frame ' +
                         str(frame_idx) + ', task: ' +
                         tasks[np.argmax(obs['observation'][0][-2:])])
            images.append(img)
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, done, _ = env.step(action)
            episode_reward += reward
            frame_idx += 1
            if args.export_gif:
                plt.imsave(
                    os.path.join(os.path.dirname(args.load_path),
                                 'tempimg%d.png' % i), img)
            else:
                plt.pause(0.02)
            if done:
                print('episode_reward', episode_reward)
                obs = env.reset()
                if 'FetchStack' in args.env:
                    while env.get_attr('current_nobject')[0] != env.get_attr('n_object')[0] or \
                                    env.get_attr('task_mode')[0] != 1:
                        obs = env.reset()
                elif 'MasspointPushDoubleObstacle' in args.env or 'FetchPushWallObstacle' in args.env:
                    while np.argmax(obs['desired_goal'][0][3:]) != 0:
                        obs = env.reset()
                print('goal', obs['desired_goal'][0])
                episode_reward = 0.0
                frame_idx = 0
                num_episode += 1
                if num_episode >= 1:
                    break
        exit()
        if args.export_gif:
            os.system('ffmpeg -r 5 -start_number 0 -i ' +
                      os.path.dirname(args.load_path) +
                      '/tempimg%d.png -c:v libx264 -pix_fmt yuv420p ' +
                      os.path.join(os.path.dirname(args.load_path), args.env +
                                   '.mp4'))
            for i in range(env_kwargs['max_episode_steps'] * 10):
                # images.append(plt.imread('tempimg' + str(i) + '.png'))
                try:
                    os.remove(
                        os.path.join(os.path.dirname(args.load_path),
                                     'tempimg' + str(i) + '.png'))
                except OSError:
                    pass
Example #11
class CustomMlpAggregatePolicy(AggregatePolicy):
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 n_env,
                 n_steps,
                 n_batch,
                 reuse=False,
                 **_kwargs):
        super(CustomMlpAggregatePolicy,
              self).__init__(sess,
                             ob_space,
                             ac_space,
                             n_env,
                             n_steps,
                             n_batch,
                             reuse,
                             layers=[16],
                             feature_extraction="mlp",
                             **_kwargs)


register_policy('SACTwoLayerMlpAggregatePolicy', SACTwoLayerMlpAggregatePolicy)
register_policy('MlpAggregatePolicy', MlpAggregatePolicy)
register_policy('CustomMlpAggregatePolicy', CustomMlpAggregatePolicy)
register_policy('CustomSACPolicy', CustomSACPolicy)
register_policy('CustomDQNPolicy', CustomDQNPolicy)
register_policy('CustomMlpPolicy', CustomMlpPolicy)
Example #12
            self._qvalue = qvalue_fn[:, 0]
        return self.qvalue_fn

    def step(self, obs, state=None, mask=None):
        return self.sess.run(self.policy, {self.obs_ph: obs})

    def proba_step(self, obs, state=None, mask=None):
        return self.sess.run(self.policy, {self.obs_ph: obs})

    def value(self, obs, action, state=None, mask=None):
        return self.sess.run(self._qvalue, {self.obs_ph: obs, self.action_ph: action})

class LinearPolicy_MLPCritic(LinearPolicy):
    """
    Policy object that implements actor critic, using a MLP (2 layers of 64)
    :param sess: (TensorFlow session) The current TensorFlow session
    :param ob_space: (Gym Space) The observation space of the environment
    :param ac_space: (Gym Space) The action space of the environment
    :param n_env: (int) The number of environments to run
    :param n_steps: (int) The number of steps to run for each environment
    :param n_batch: (int) The number of batch to run (n_envs * n_steps)
    :param reuse: (bool) If the policy is reusable or not
    :param _kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction
    """

    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, **_kwargs):
        super(LinearPolicy_MLPCritic, self).__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse,
                                        feature_extraction="mlp", **_kwargs)
        
register_policy("LinearPolicy_MLPCritic", LinearPolicy_MLPCritic)
Example #13
                        activation_fn=activ)

                # Output layer
                action_scores = tf.contrib.layers.fully_connected(
                    action_out, num_outputs=self.n_actions, activation_fn=None)

            assert not self.dueling, "Dueling currently not supported"
            q_out = action_scores

        self.q_values = q_out
        self._setup_init()

    def step(self, obs, state=None, mask=None, deterministic=True):
        q_values, actions_proba = self.sess.run(
            [self.q_values, self.policy_proba], {self.obs_ph: obs})
        if deterministic:
            actions = np.argmax(q_values, axis=1)
        else:
            actions = np.zeros((len(obs), ), dtype=np.int64)
            for action_idx in range(len(obs)):
                actions[action_idx] = np.random.choice(
                    self.n_actions, p=actions_proba[action_idx])

        return actions, q_values, None

    def proba_step(self, obs, state=None, mask=None):
        return self.sess.run(self.policy_proba, {self.obs_ph: obs})


register_policy("StatePlusImagePolicy", StatePlusImagePolicy)
Example #14
                      retro=False,
                      resolution=84
                      )
        env = AnimalSkip(env, skip=SKIP_FRAMES)
        env = AnimalWrapper(env)
        env = AnimalStack(env, VISUAL_FRAMES_COUNT, VEL_FRAMES_COUNT, greyscale=USE_GREYSCALE_OBSES)
        return env
        
    return env

# Define environments
env = create_env_fn(num_actors=1, inference=False, seed=0)
env = make_vec_env(env, n_envs=4)

# # register policy
register_policy('MyPolicy', LstmPolicy)

# # define algorithm
model = PPO2('MyPolicy', env, n_steps=256)

#########################
# Dataset concatenation #
#########################

def dataset_concatenation(dataset_path):
    '''
    Use only when you have datasets of separate environments.
    If not, and the code already has a concatenated all_data.npz, ***do not use this function***

    Input: directory where the per-environment expert trajectory .npz files are present
    Output: an all_data.npz file in the same directory
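The example stops inside this docstring. A minimal sketch of the kind of concatenation it describes, assuming every per-environment .npz file stores the same set of array keys (the function name is illustrative, not the author's implementation):

import glob
import os
import numpy as np

def concatenate_expert_datasets(dataset_path):
    # Gather arrays key by key from every per-environment archive.
    merged = {}
    for path in sorted(glob.glob(os.path.join(dataset_path, '*.npz'))):
        data = np.load(path, allow_pickle=True)
        for key in data.files:
            merged.setdefault(key, []).append(data[key])
    # Stack each key across environments and write a single archive.
    merged = {key: np.concatenate(chunks, axis=0) for key, chunks in merged.items()}
    np.savez(os.path.join(dataset_path, 'all_data.npz'), **merged)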
Example #15
from stable_baselines.common.policies import register_policy

from baselines_lab.policies.cnn_policy import SimpleMazeCnnPolicy, GeneralCnnPolicy
from baselines_lab.policies.rnd_policy import RndPolicy
from baselines_lab.policies.deepq import FeedForwardPolicy

register_policy('RndPolicy', RndPolicy)
register_policy('SimpleMazeCnnPolicy', SimpleMazeCnnPolicy)
register_policy('GeneralCnnPolicy', GeneralCnnPolicy)
register_policy('GeneralDqnPolicy', FeedForwardPolicy)
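For context, the pattern every snippet on this page relies on: once a policy class is registered under a name, that name string can be passed to an algorithm in place of the class itself. A self-contained sketch (policy name, network sizes, and environment are illustrative, not taken from the example above):

import gym
from stable_baselines import PPO2
from stable_baselines.common.policies import FeedForwardPolicy, register_policy

class TinyMlpPolicy(FeedForwardPolicy):
    def __init__(self, *args, **kwargs):
        super(TinyMlpPolicy, self).__init__(*args, **kwargs,
                                            net_arch=[dict(pi=[32], vf=[32])],
                                            feature_extraction="mlp")

# register_policy raises an error if the name is already taken.
register_policy('TinyMlpPolicy', TinyMlpPolicy)

# The registered name now resolves to the class above.
model = PPO2('TinyMlpPolicy', gym.make('CartPole-v1'), verbose=0)
model.learn(total_timesteps=1000)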
Example #16
import os,sys
sys.path.insert(0,'D:\\GitHub\\Quantitative-analysis-with-Deep-Learning\\quantitative_analysis_with_deep_learning')

import gym

from stable_baselines.common.policies import FeedForwardPolicy, LstmPolicy, register_policy
from stable_baselines.ddpg.policies import FeedForwardPolicy as DDPG_FeedForwardPolicy
from stable_baselines.td3.policies import FeedForwardPolicy as TD3_FeedForwardPolicy

# Custom policy and value networks
class CustomDDPGPolicy(DDPG_FeedForwardPolicy):
    def __init__(self, *args, **kwargs):
        super(CustomDDPGPolicy, self).__init__( *args, **kwargs,
                                            layers=[256, 128, 128, 64],
                                            feature_extraction="mlp")


# Register the policy, it will check that the name is not already taken
register_policy('CustomDDPGPolicy', CustomDDPGPolicy)

class CustomTD3Policy(TD3_FeedForwardPolicy):
    def __init__(self, *args, **kwargs):
        super(CustomTD3Policy, self).__init__( *args, **kwargs,
                                            layers=[256, 128, 128, 64],
                                            feature_extraction="mlp")

register_policy('CustomTD3Policy', CustomTD3Policy)


# Custom LSTM policy with two MLP layers of size 64 each + a shared LSTM layer of size 4
class CustomLSTMPolicy(LstmPolicy):
    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm=4, reuse=False, **_kwargs):
        super().__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch, n_lstm, reuse,
                         net_arch=['lstm', dict(pi=[64, 64],
                                            vf=[64, 64])],
                         layer_norm=True, feature_extraction="mlp", **_kwargs)

# Custom MLP policy of two layers of size 81 each
class CustomPolicy_2x81(FeedForwardPolicy):
    def __init__(self, *args, **kwargs):
        super(CustomPolicy_2x81, self).__init__(*args, **kwargs,
                                           net_arch=[dict(pi=[81, 81],
                                                          vf=[81, 81])],
                                           feature_extraction="mlp")

# Custom MLP policy of three layers with variable size
class CustomPolicy_3_var(FeedForwardPolicy):
    def __init__(self, *args, **kwargs):
        super(CustomPolicy_3_var, self).__init__(*args, **kwargs,
                                           net_arch=[dict(pi=[80, 49, 30],
                                                          vf=[80, 28, 10])],
                                           feature_extraction="mlp")

# Register the policy, it will check that the name is not already taken
register_policy('CustomPolicy_3x64', CustomPolicy_3x64)
register_policy('CustomPolicy_2x64_shared', CustomPolicy_2x64_shared)
register_policy('CustomPolicy_4x128', CustomPolicy_4x128)
register_policy('CustomLSTMPolicy', CustomLSTMPolicy)
register_policy('CustomPolicy_2x81', CustomPolicy_2x81)
register_policy('CustomPolicy_3_var', CustomPolicy_3_var)
Example #18

class CustomPolicy(FeedForwardPolicy):
    def __init__(self, *args, **kwargs):
        super(CustomPolicy, self).__init__(*args, **kwargs,
                                           net_arch=[dict(pi=[64],
                                                          vf=[64])],
                                           feature_extraction="cnn", cnn_extractor =custom_cnn )

class DqnCnnPolicy(DqnFFPolicy):
    def __init__(self, *args, **kwargs):
        super(DqnCnnPolicy, self).__init__(*args, **kwargs,
                                           feature_extraction="cnn", cnn_extractor =custom_cnn )

# Register the policy, it will check that the name is not already taken
register_policy('CustomPolicy', CustomPolicy)
register_policy('DqnCnnPolicy', DqnCnnPolicy)


def ppo2train(args):

    with tf.device('/device:GPU:1'):
        # gpus = tf.config.experimental.list_physical_devices('GPU')

        # tf.config.experimental.set_visible_devices(gpus[1], 'GPU')
        env = make_vec_env('python_1p-v0', n_envs=4)

        # env = gym.make('python_1p-v0')
        # env = Monitor(env, filename=None, allow_early_resets=True)
        # env = DummyVecEnv([lambda: env])
Example #19
class LargeSACPolicy(SACPolicy):
    def __init__(self, *args, **kwargs):
        super(LargeSACPolicy, self).__init__(*args, **kwargs, layers=[256, 256, 256], feature_extraction="mlp")


class LargeBasePolicy(BasePolicy):
    def __init__(self, *args, **kwargs):
        super(LargeBasePolicy, self).__init__(*args, **kwargs, layers=[256, 256, 256], feature_extraction="mlp")


class MediumBasePolicy(BasePolicy):
    def __init__(self, *args, **kwargs):
        super(MediumBasePolicy, self).__init__(*args, **kwargs, layers=[256, 256], feature_extraction="mlp")


register_policy("CustomSACPolicy", CustomSACPolicy)
register_policy("TinySACPolicy", TinySACPolicy)
register_policy("LargeSACPolicy", LargeSACPolicy)
register_policy("LargeBasePolicy", LargeBasePolicy)
register_policy("MediumBasePolicy", MediumBasePolicy)
register_policy("TinyDQNPolicy", TinyDQNPolicy)
register_policy("MediumDQNPolicy", MediumDQNPolicy)
register_policy("LargeDQNPolicy", LargeDQNPolicy)
register_policy("HugeDQNPolicy", HugeDQNPolicy)
register_policy("BigBigDQNPolicy", BigBigDQNPolicy)
register_policy("BigBigBigDQNPolicy", BigBigBigDQNPolicy)
register_policy("CustomMlpPolicy", CustomMlpPolicy)


def linear_schedule(initial_value):
    """
Example #20
                 n_steps,
                 n_batch,
                 num_actions,
                 distributed_single_stream=False,
                 reuse=False,
                 obs_phs=None,
                 dueling=True,
                 **_kwargs):
        super(CnnActPolicy, self).__init__(
            sess,
            ob_space,
            ac_space,
            n_env,
            n_steps,
            n_batch,
            num_actions,
            distributed_single_stream=distributed_single_stream,
            reuse=reuse,
            aggregator='reduceLocalMean',
            feature_extraction="cnn",
            obs_phs=obs_phs,
            dueling=dueling,
            layer_norm=False,
            **_kwargs)


register_policy("CnnActPolicy", MlpActPolicy)
register_policy("MlpActPolicy", MlpActPolicy)
register_policy("LnMlpActPolicy", LnMlpActPolicy)
register_policy("ActionBranching", ActionBranching)
Example #21
    :param n_env: (int) The number of environments to run
    :param n_steps: (int) The number of steps to run for each environment
    :param n_batch: (int) The number of batch to run (n_envs * n_steps)
    :param reuse: (bool) If the policy is reusable or not
    :param _kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction
    """
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 n_env=1,
                 n_steps=1,
                 n_batch=None,
                 reuse=False,
                 **_kwargs):
        super(LnMlpPolicy, self).__init__(sess,
                                          ob_space,
                                          ac_space,
                                          n_env,
                                          n_steps,
                                          n_batch,
                                          reuse,
                                          feature_extraction="mlp",
                                          layer_norm=True,
                                          **_kwargs)


register_policy("C51CnnPolicy", CnnPolicy)
register_policy("C51LnCnnPolicy", LnCnnPolicy)
register_policy("C51MlpPolicy", MlpPolicy)
register_policy("C51LnMlpPolicy", LnMlpPolicy)
Example #22
        action = self.deterministic_action if deterministic else self.action
        feed_dict = self._make_feed_dict(obs, state, mask)
        outputs = [action, self.value_flat, self.state_out, self.neglogp]
        if extra_op is not None:
            outputs.append(extra_op)
            a, v, s, neglogp, ex = self.sess.run(outputs, feed_dict)
        else:
            a, v, s, neglogp = self.sess.run(outputs, feed_dict)

        state = []
        for x in s:
            state.append(x.c)
            state.append(x.h)
        state = np.array(state)
        state = np.transpose(state, (1, 0, 2))

        if extra_op is not None:
            return a, v, state, neglogp, ex
        else:
            return a, v, state, neglogp

    def proba_step(self, obs, state=None, mask=None):
        return self.sess.run(self.policy_proba, self._make_feed_dict(obs, state, mask))

    def value(self, obs, state=None, mask=None):
        return self.sess.run(self.value_flat, self._make_feed_dict(obs, state, mask))


register_policy('BansalMlpPolicy', MlpPolicyValue)
register_policy('BansalLstmPolicy', LSTMPolicy)
Example #23
                                              **kwargs,
                                              layers=[32, 16],
                                              act_fun=tf.nn.elu,
                                              feature_extraction="mlp")


class CustomDDPGPolicy(DDPGPolicy):
    def __init__(self, *args, **kwargs):
        super(CustomDDPGPolicy, self).__init__(*args,
                                               **kwargs,
                                               layers=[32, 8],
                                               feature_extraction="mlp",
                                               layer_norm=True)


register_policy('CustomDDPGPolicy', CustomDDPGPolicy)
register_policy('LargeSACPolicy', LargeSACPolicy)
register_policy('TinySACPolicy', TinySACPolicy)
register_policy('CustomSACPolicy', CustomSACPolicy)
register_policy('CustomMlpPolicy', CustomMlpPolicy)


def load_vae(path=None, z_size=None):
    """
    :param path: (str)
    :param z_size: (int)
    :return: (VAEController)
    """
    # z_size will be recovered from saved model
    if z_size is None:
        assert path is not None
Example #24
                 ob_space,
                 ac_space,
                 n_env,
                 n_steps,
                 n_batch,
                 reuse=False,
                 obs_phs=None,
                 dueling=True,
                 layer_norm=False,
                 l1_regularizer=0.,
                 l2_regularizer=0.,
                 **_kwargs):
        super(CustomRegularizedDQNMlpPolicy,
              self).__init__(sess,
                             ob_space,
                             ac_space,
                             n_env,
                             n_steps,
                             n_batch,
                             reuse,
                             feature_extraction="mlp",
                             obs_phs=obs_phs,
                             dueling=dueling,
                             layer_norm=layer_norm,
                             l1_regularizer=l1_regularizer,
                             l2_regularizer=l2_regularizer,
                             **_kwargs)


register_policy("CustomRegularizedDQNMlpPolicy", CustomRegularizedDQNMlpPolicy)
Example #25
    :param n_steps: (int) The number of steps to run for each environment
    :param n_batch: (int) The number of batch to run (n_envs * n_steps)
    :param reuse: (bool) If the policy is reusable or not
    :param _kwargs: (dict) Extra keyword arguments for the nature CNN feature extraction
    """
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 n_env,
                 n_steps,
                 n_batch,
                 reuse=False,
                 **_kwargs):
        super(LnMlpPolicy, self).__init__(sess,
                                          ob_space,
                                          ac_space,
                                          n_env,
                                          n_steps,
                                          n_batch,
                                          reuse,
                                          feature_extraction="mlp",
                                          layer_norm=True,
                                          **_kwargs)


register_policy("CnnPolicy", CnnPolicy)
register_policy("LnCnnPolicy", LnCnnPolicy)
register_policy("MlpPolicy", MlpPolicy)
register_policy("LnMlpPolicy", LnMlpPolicy)
Example #26
    def __init__(self, *args, **kwargs):
        super(CustomMlpPolicy, self).__init__(*args,
                                              **kwargs,
                                              layers=[16],
                                              feature_extraction="mlp")


class CustomSACPolicy(SACPolicy):
    def __init__(self, *args, **kwargs):
        super(CustomSACPolicy, self).__init__(*args,
                                              **kwargs,
                                              layers=[256, 256],
                                              feature_extraction="mlp")


register_policy('CustomSACPolicy', CustomSACPolicy)
register_policy('CustomDQNPolicy', CustomDQNPolicy)
register_policy('SmallMobileNetCnnPolicy', SmallMobileNetCnnPolicy)
register_policy('CustomLowerFlopCnnPolicy', CustomLowerFlopCnnPolicy)
register_policy('CustomMlpPolicy', CustomMlpPolicy)


def flatten_dict_observations(env):
    assert isinstance(env.observation_space, gym.spaces.Dict)
    keys = env.observation_space.spaces.keys()
    return gym.wrappers.FlattenDictWrapper(env, dict_keys=list(keys))


def get_wrapper_class(hyperparams):
    """
    Get a Gym environment wrapper class specified as a hyper parameter