Example #1
def make_robotics_env(env_id, seed, rank=0):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.
    """
    set_global_seeds(seed)
    env = gym.make(env_id)
    env = FlattenDictWrapper(env, ['observation', 'desired_goal'])
    env = Monitor(
        env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
        info_keywords=('is_success',))
    env.seed(seed)
    return env
Example #3
def make_robotics_env(env_id, seed, rank=0, allow_early_resets=True):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.

    :param env_id: (str) the environment ID
    :param seed: (int) the initial seed for RNG
    :param rank: (int) the rank of the environment (for logging)
    :param allow_early_resets: (bool) allows early reset of the environment
    :return: (Gym Environment) The robotic environment
    """
    set_global_seeds(seed)
    env = gym.make(env_id)
    keys = ['observation', 'desired_goal']
    # TODO: remove try-except once most users are running modern Gym
    try:  # for modern Gym (>=0.15.4)
        from gym.wrappers import FilterObservation, FlattenObservation
        env = FlattenObservation(FilterObservation(env, keys))
    except ImportError:  # for older gym (<=0.15.3)
        from gym.wrappers import FlattenDictWrapper  # pytype:disable=import-error
        env = FlattenDictWrapper(env, keys)
    env = Monitor(env,
                  logger.get_dir()
                  and os.path.join(logger.get_dir(), str(rank)),
                  info_keywords=('is_success', ),
                  allow_early_resets=allow_early_resets)
    env.seed(seed)
    return env
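To see what this flattening buys you, here is a minimal sketch (assuming a Gym install that ships the robotics envs; FetchReach-v1 is only an illustrative id): the wrapper turns the goal-based Dict observation into a flat Box over the selected keys, which is what the Monitor and most downstream RL code expect.

import gym
from gym.wrappers import FilterObservation, FlattenObservation

env = gym.make('FetchReach-v1')
print(env.observation_space)  # Dict with 'achieved_goal', 'desired_goal', 'observation'
env = FlattenObservation(FilterObservation(env, ['observation', 'desired_goal']))
print(env.observation_space)  # flat Box made of the two selected keys, concatenated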
Example #4
        def _make_robosuite_env():
            from gym.wrappers import FlattenDictWrapper
            from baselines.bench import Monitor

            env = suite.make(env_id)
            env = FlattenDictWrapper(env, ['robot-state', 'object-state'])
            env = Monitor(env, logger.get_dir(), allow_early_resets=True)
            return env
Example #5
def test_flatten2dict():
    dict_env = gym.make('PendulumDictEnv-v0')
    dict_env = GymEnv(dict_env)
    dict_ob = dict_env.observation_space.sample()
    dict_observation_space = dict_env.observation_space
    env = FlattenDictWrapper(dict_env,
                             dict_env.observation_space.spaces.keys())
    flatten_ob = env.observation(dict_ob)
    dict_keys = env.dict_keys
    recovered_dict_ob = flatten_to_dict(flatten_ob, dict_observation_space,
                                        dict_keys)
    tf = []
    for (a_key, a_val), (b_key, b_val) in zip(dict_ob.items(),
                                              recovered_dict_ob.items()):
        tf.append(a_key == b_key)
        tf.append(all(a_val == b_val))
    assert all(tf)
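The test above relies on a flatten_to_dict helper that is not shown on this page. A minimal sketch of such an inverse, assuming the wrapper simply ravels and concatenates the selected keys in order (a hypothetical reimplementation, not the project's own code):

import numpy as np
from collections import OrderedDict

def flatten_to_dict(flat_ob, dict_observation_space, dict_keys):
    # Split the flat vector back into per-key arrays, assuming the wrapper
    # concatenated the keys in the given order.
    recovered = OrderedDict()
    start = 0
    for key in dict_keys:
        space = dict_observation_space.spaces[key]
        size = int(np.prod(space.shape))
        recovered[key] = np.asarray(flat_ob[start:start + size]).reshape(space.shape)
        start += size
    return recovered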
Example #6
def make_robotics_env(env_id, seed, rank=0):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.
    
    :param env_id: (str) the environment ID
    :param seed: (int) the initial seed for RNG
    :param rank: (int) the rank of the environment (for logging)
    :return: (Gym Environment) The robotic environment
    """
    set_global_seeds(seed)
    env = gym.make(env_id)
    env = FlattenDictWrapper(env, ['observation', 'desired_goal'])
    env = Monitor(env,
                  logger.get_dir()
                  and os.path.join(logger.get_dir(), str(rank)),
                  info_keywords=('is_success', ))
    env.seed(seed)
    return env
Example #7
    def _thunk():
        env = gym.make(env_id)
        if env_id.find('Fetch') == -1:
            env = FlattenObservation(env)
        else:
            env = FlattenDictWrapper(env, ['achieved_goal', 'desired_goal'])
        env = RandomizedEnvWrapper(env, seed + rank)

        env.seed(seed + rank)

        return env
Example #8
def environment(spec, kwargs):
    env = FlattenDictWrapper(spec.make(**kwargs), ['observation', 'desired_goal', 'achieved_goal'])
    ob_space = env.observation_space
    act_space = env.action_space
    ob = env.reset()
    assert ob_space.contains(ob), 'Reset observation: {!r} not in space'.format(ob)
    a = act_space.sample()
    observation, reward, done, _info = env.step(a)
    assert ob_space.contains(observation), 'Step observation: {!r} not in space'.format(observation)
    assert np.isscalar(reward), "{} is not a scalar for {}".format(reward, env)
    assert isinstance(done, bool), "Expected {} to be a boolean".format(done)

    for mode in env.metadata.get('render.modes', []):
        env.render(mode=mode)

    # Make sure we can render the environment after close.
    for mode in env.metadata.get('render.modes', []):
        env.render(mode=mode)

    env.close()
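A hypothetical invocation of this smoke test, assuming numpy is imported as np, the Fetch envs are registered, and gym.spec is available for the chosen id:

import gym
environment(gym.spec('FetchReach-v1'), {})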
Example #9
    def run_games_for_agent(self, agent_number, agent_class):
        """Runs a set of games for a given agent, saving the results in self.results"""
        agent_results = []
        agent_name = agent_class.agent_name
        agent_group = self.agent_to_agent_group[agent_name]
        agent_round = 1
        # print("!!", self.config.environment)
        # print(self.config.environment._max_episode_steps)

        #&&&&&&&&&&&&
        agent_config = copy.deepcopy(self.config)

        if self.environment_has_changeable_goals(agent_config.environment) \
                and self.agent_cant_handle_changeable_goals_without_flattening(agent_name):
            print("Flattening changeable-goal environment for agent {}".format(
                agent_name))
            agent_config.environment = FlattenDictWrapper(
                agent_config.environment,
                dict_keys=["observation", "desired_goal"])

        if self.config.randomise_random_seed:
            agent_config.seed = random.randint(0, 2**32 - 2)
        agent_config.hyperparameters = agent_config.hyperparameters[
            agent_group]
        print("AGENT NAME: {}".format(agent_name))

        manager = mp.Manager()
        return_q = manager.Queue()
        agent = agent_class(agent_config)
        self.environment_name = agent.environment_title
        jobs = []
        for i in range(self.config.runs_per_agent):
            p = mp.Process(target=agent.run_n_episodes, args=(return_q, ))
            jobs.append(p)
            p.start()

        for proc in jobs:
            proc.join()

        # print("(GridTrainer.py) process end!")
        for game_scores, rolling_scores, time_taken in iter(
                return_q.get, None):
            agent_results.append([
                game_scores, rolling_scores,
                len(rolling_scores), -1 * max(rolling_scores), time_taken
            ])
            if return_q.empty():
                break
        #&&&&&&&&&
        self.results[agent_name] = agent_results
Example #10
def _make_env(env_id, env_type, seed, reward_shaping, frame_stack, **kwargs):
    """Make single env"""
    check_name_in_list(env_id,
                       env_type)  # check existence of env_id in env_type
    if env_type == 'atari':
        env = gym.make(env_id)
        env = NoopResetEnv(env, noop_max=30)
        if 'NoFrameskip' in env.spec.id:
            env = MaxAndSkipEnv(env, skip=4)
        env = Monitor(env)
        # deepmind wrap
        env = EpisodicLifeEnv(env)
        if 'FIRE' in env.unwrapped.get_action_meanings():
            env = FireResetEnv(env)
        env = WarpFrame(env)
        env = ClipRewardEnv(env)
        if frame_stack:
            env = FrameStack(env, 4)
    elif env_type in ['classic_control', 'box2d', 'mujoco']:
        env = gym.make(env_id).unwrapped
        max_episode_steps = kwargs.get('max_episode_steps')
        if max_episode_steps is not None:
            env = TimeLimit(env.unwrapped, max_episode_steps)
        env = Monitor(env)
    elif env_type == 'robotics':
        env = gym.make(env_id)
        env = FlattenDictWrapper(env, ['observation', 'desired_goal'])
        env = Monitor(env, info_keywords=('is_success', ))
    elif env_type == 'dm_control':
        env = gym.make('dm2gym:' + env_id,
                       environment_kwargs={'flat_observation': True})
        env = DmObsTrans(env)
    elif env_type == 'rlbench':
        from rlzoo.common.build_rlbench_env import RLBenchEnv
        state_type = kwargs.get('state_type')
        env = RLBenchEnv(env_id) if state_type is None else RLBenchEnv(
            env_id, state_type)
    else:
        raise NotImplementedError

    if reward_shaping is not None:
        if callable(reward_shaping):
            env = RewardShaping(env, reward_shaping)
        else:
            raise ValueError('reward_shaping parameter must be callable')
    env.seed(seed)
    return env
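A hypothetical call for the robotics branch, which is the one using FlattenDictWrapper (it assumes the surrounding helpers such as check_name_in_list and Monitor are importable and the Fetch envs are installed):

env = _make_env('FetchPush-v1', 'robotics', seed=0, reward_shaping=None, frame_stack=False)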
Example #11
def random_rollout(spec, kwargs):
    env = FlattenDictWrapper(spec.make(**kwargs), ['observation', 'desired_goal', 'achieved_goal'])
    agent = lambda ob: env.action_space.sample()
    ob = env.reset()
    for _ in range(10):
        assert env.observation_space.contains(ob)
        a = agent(ob)
        assert env.action_space.contains(a)
        (ob, _reward, done, _info) = env.step(a)
        if done:
            break
    env.close()
Example #12
def make_env(env_id,
             rank,
             log_dir=None,
             allow_early_resets=True,
             flatten_dict=False,
             kwargs=None):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.

    :param env_id: (str) the environment ID
    :param rank: (int) the rank of the environment (for logging)
    :param log_dir: (str) directory for the Monitor log files (no monitoring if None)
    :param allow_early_resets: (bool) allows early reset of the environment
    :param flatten_dict: (bool) flatten the Dict observation into a single vector
    :param kwargs: (dict) keyword arguments used when registering the environment (e.g. max_episode_steps, reward_type)
    :return: (Gym Environment) The mujoco environment
    """
    if env_id in ENTRY_POINT.keys():
        kwargs = kwargs.copy()
        max_episode_steps = None
        if 'max_episode_steps' in kwargs:
            max_episode_steps = kwargs['max_episode_steps']
            del kwargs['max_episode_steps']
        gym.register(env_id,
                     entry_point=ENTRY_POINT[env_id],
                     max_episode_steps=max_episode_steps,
                     kwargs=kwargs)
        env = gym.make(env_id)
    else:
        raise NotImplementedError
    if flatten_dict:
        env = FlattenDictWrapper(
            env, ['observation', 'achieved_goal', 'desired_goal'])
    if 'FetchStack' in env_id and (
            'Unlimit' not in env_id) and max_episode_steps is None:
        from utils.wrapper import FlexibleTimeLimitWrapper
        env = FlexibleTimeLimitWrapper(env, 100)
    if kwargs['reward_type'] != 'sparse':
        env = DoneOnSuccessWrapper(env, 0.0)
    else:
        env = DoneOnSuccessWrapper(env)
    if log_dir is not None:
        env = Monitor(env,
                      os.path.join(log_dir,
                                   str(rank) + ".monitor.csv"),
                      allow_early_resets=allow_early_resets,
                      info_keywords=('is_success', ))
    return env
Example #13
def make_robotics_env(env_id, seed, rank=0):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.
    """
    set_global_seeds(seed)
    env = gym.make(env_id)
    #env = FlattenDictWrapper(env, ['observation', 'desired_goal'])
    keys = ['observation', 'desired_goal']
    # TODO: remove try-except once most users are running modern Gym
    try:  # for modern Gym (>=0.15.4)
        from gym.wrappers import FilterObservation, FlattenObservation
        env = FlattenObservation(FilterObservation(env, keys))
    except ImportError:  # for older gym (<=0.15.3)
        from gym.wrappers import FlattenDictWrapper  # pytype:disable=import-error
        env = FlattenDictWrapper(env, keys)
    env = Monitor(env,
                  logger.get_dir()
                  and os.path.join(logger.get_dir(), str(rank)),
                  info_keywords=('is_success', ))
    env.seed(seed)
    return env
Example #14
def create_environment(name: str) -> gym.Env:
    print('Creating environment %s...' % name)
    ids = name.split('-')
    framework = ids[0].lower()
    env_id = '-'.join(ids[1:])
    if framework == 'dm':
        from envs.deepmind import DMSuiteEnv
        return DMSuiteEnv(env_id)
    elif framework == 'gym':
        env = gym.make(env_id).env
        from gym.envs.robotics.robot_env import RobotEnv
        if not isinstance(env, RobotEnv):
            env = BetterRgbRenderingEnv(env)
        if isinstance(env.observation_space, gym.spaces.Dict):
            from gym.wrappers import FlattenDictWrapper
            env = FlattenDictWrapper(env, env.observation_space.spaces.keys())
        return env
    elif framework == 'rllab':
        from envs.rllab import RllabEnv
        return RllabEnv(env_id)

    raise LookupError("Could not find environment \"%s\"." % env_id)
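The name encodes the framework as a prefix before the first dash. A hypothetical call for the gym branch, assuming the Fetch envs are installed (the Dict observation gets flattened because the observation space is a gym.spaces.Dict):

env = create_environment('gym-FetchPickAndPlace-v1')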
Example #15
    def _thunk():
        if env_id.startswith("dm"):
            _, domain, task = env_id.split('.')
            #            env=suite.load(domain,task,environment_kwargs=dict(flat_observation=True))
            #           env=DMControlEnv(env)
            p = "dm2gym:" + domain.capitalize() + task.capitalize() + "-v0"
            env = gym.make(p, environment_kwargs=dict(flat_observation=True))
            env = FlattenDictWrapper(env, ['observations'])
        else:
            env = gym.make(env_id)

        is_atari = hasattr(gym.envs, 'atari') and isinstance(
            env.unwrapped, gym.envs.atari.atari_env.AtariEnv)
        if is_atari:
            env = make_atari(env_id)

        env.seed(seed + rank)

        obs_shape = env.observation_space.shape

        if str(env.__class__.__name__).find('TimeLimit') >= 0:
            env = TimeLimitMask(env)

        if log_dir is not None:
            env = bench.Monitor(env,
                                os.path.join(log_dir, str(rank)),
                                allow_early_resets=allow_early_resets)

        if is_atari:
            if len(env.observation_space.shape) == 3:
                env = wrap_deepmind(env)
        elif len(env.observation_space.shape) == 3:
            raise NotImplementedError(
                "CNN models work only for atari,\n"
                "please use a custom wrapper for a custom pixel input env.\n"
                "See wrap_deepmind for an example.")

        # If the input has shape (W,H,3), wrap for PyTorch convolutions
        obs_shape = env.observation_space.shape
        if len(obs_shape) == 3 and obs_shape[2] in [1, 3]:
            env = TransposeImage(env, op=[2, 0, 1])

        return env
Example #16
def main(args):
    log_dir = args.log_path if (
        args.log_path is not None
    ) else "/tmp/stable_baselines_" + time.strftime('%Y-%m-%d-%H-%M-%S')
    if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
        rank = 0
        configure_logger(log_dir)
    else:
        rank = MPI.COMM_WORLD.Get_rank()
        configure_logger(log_dir, format_strs=[])

    set_global_seeds(args.seed)

    model_class = SAC_parallel

    n_workers = args.num_workers if not args.play else 1
    env_kwargs = get_env_kwargs(args.env,
                                random_ratio=args.random_ratio,
                                sequential=args.sequential,
                                reward_type=args.reward_type,
                                n_object=args.n_object)

    def make_thunk(rank):
        return lambda: make_env(
            env_id=args.env, rank=rank, log_dir=log_dir, kwargs=env_kwargs)

    env = ParallelSubprocVecEnv([make_thunk(i) for i in range(n_workers)],
                                reset_when_done=True)

    if os.path.exists(os.path.join(logger.get_dir(), 'eval.csv')):
        os.remove(os.path.join(logger.get_dir(), 'eval.csv'))
        print('Remove existing eval.csv')
    eval_env_kwargs = env_kwargs.copy()
    eval_env_kwargs['random_ratio'] = 0.0
    eval_env = make_env(env_id=args.env, rank=0, kwargs=eval_env_kwargs)
    eval_env = FlattenDictWrapper(
        eval_env, ['observation', 'achieved_goal', 'desired_goal'])

    if not args.play:
        os.makedirs(log_dir, exist_ok=True)

    # Available strategies (cf paper): future, final, episode, random
    goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

    if not args.play:
        from stable_baselines.ddpg.noise import NormalActionNoise
        noise_type = args.action_noise.split('_')[0]
        if noise_type == 'none':
            parsed_action_noise = None
        elif noise_type == 'normal':
            sigma = float(args.action_noise.split('_')[1])
            parsed_action_noise = NormalActionNoise(
                mean=np.zeros(env.action_space.shape),
                sigma=sigma * np.ones(env.action_space.shape))
        else:
            raise NotImplementedError
        train_kwargs = get_train_kwargs("sac", args, parsed_action_noise,
                                        eval_env)

        def callback(_locals, _globals):
            if _locals['step'] % int(1e3) == 0:
                if 'FetchStack' in args.env:
                    mean_eval_reward = stack_eval_model(
                        eval_env,
                        _locals["self"],
                        init_on_table=(args.env == 'FetchStack-v2'))
                elif 'MasspointPushDoubleObstacle-v2' in args.env:
                    mean_eval_reward = egonav_eval_model(
                        eval_env,
                        _locals["self"],
                        env_kwargs["random_ratio"],
                        fixed_goal=np.array([4., 4., 0.15, 0., 0., 0., 1.]))
                    mean_eval_reward2 = egonav_eval_model(
                        eval_env,
                        _locals["self"],
                        env_kwargs["random_ratio"],
                        goal_idx=0,
                        fixed_goal=np.array([4., 4., 0.15, 1., 0., 0., 0.]))
                    log_eval(_locals['self'].num_timesteps,
                             mean_eval_reward2,
                             file_name="eval_box.csv")
                else:
                    mean_eval_reward = eval_model(eval_env, _locals["self"])
                log_eval(_locals['self'].num_timesteps, mean_eval_reward)
            if _locals['step'] % int(2e4) == 0:
                model_path = os.path.join(
                    log_dir, 'model_' + str(_locals['step'] // int(2e4)))
                model.save(model_path)
                print('model saved to', model_path)
            return True

        class CustomSACPolicy(SACPolicy):
            def __init__(self, *model_args, **model_kwargs):
                super(CustomSACPolicy, self).__init__(
                    *model_args,
                    **model_kwargs,
                    layers=[256, 256] if 'MasspointPushDoubleObstacle'
                    in args.env else [256, 256, 256, 256],
                    feature_extraction="mlp")

        register_policy('CustomSACPolicy', CustomSACPolicy)
        from utils.sac_attention_policy import AttentionPolicy
        register_policy('AttentionPolicy', AttentionPolicy)
        policy_kwargs = get_policy_kwargs("sac", args)

        if rank == 0:
            print('train_kwargs', train_kwargs)
            print('policy_kwargs', policy_kwargs)
        # Wrap the model
        model = HER2(args.policy,
                     env,
                     model_class,
                     n_sampled_goal=4,
                     goal_selection_strategy=goal_selection_strategy,
                     num_workers=args.num_workers,
                     policy_kwargs=policy_kwargs,
                     verbose=1,
                     **train_kwargs)
        print(model.get_parameter_list())

        # Train the model
        model.learn(
            int(args.num_timesteps),
            seed=args.seed,
            callback=callback,
            log_interval=100 if not ('MasspointMaze-v3' in args.env) else 10)

        if rank == 0:
            model.save(os.path.join(log_dir, 'final'))

    # WARNING: you must pass an env
    # or wrap your environment with HERGoalEnvWrapper to use the predict method
    if args.play and rank == 0:
        assert args.load_path is not None
        model = HER2.load(args.load_path, env=env)

        fig, ax = plt.subplots(1, 1, figsize=(8, 8))
        obs = env.reset()
        if 'FetchStack' in args.env:
            env.env_method('set_task_array',
                           [[(env.get_attr('n_object')[0], 0)]])
            obs = env.reset()
            while env.get_attr('current_nobject')[0] != env.get_attr(
                    'n_object')[0] or env.get_attr('task_mode')[0] != 1:
                obs = env.reset()
        elif 'FetchPushWallObstacle' in args.env:
            while not (obs['observation'][0][4] > 0.7
                       and obs['observation'][0][4] < 0.8):
                obs = env.reset()
            env.env_method('set_goal', [np.array([1.18, 0.8, 0.425, 1, 0])])
            obs = env.env_method('get_obs')
            obs = {
                'observation': obs[0]['observation'][None],
                'achieved_goal': obs[0]['achieved_goal'][None],
                'desired_goal': obs[0]['desired_goal'][None]
            }
            # obs[0] = np.concatenate([obs[0][key] for key in ['observation', 'achieved_goal', 'desired_goal']])
        elif 'MasspointPushDoubleObstacle' in args.env or 'FetchPushWallObstacle' in args.env:
            while np.argmax(obs['desired_goal'][0][3:]) != 0:
                obs = env.reset()
        elif 'MasspointMaze-v2' in args.env:
            while obs['observation'][0][0] < 3 or obs['observation'][0][1] < 3:
                obs = env.reset()
            env.env_method('set_goal', [np.array([1., 1., 0.15])])
            obs = env.env_method('get_obs')
            obs = {
                'observation': obs[0]['observation'][None],
                'achieved_goal': obs[0]['achieved_goal'][None],
                'desired_goal': obs[0]['desired_goal'][None]
            }

        print('goal', obs['desired_goal'][0], 'obs', obs['observation'][0])
        episode_reward = 0.0
        images = []
        frame_idx = 0
        num_episode = 0
        for i in range(env_kwargs['max_episode_steps'] * 10):
            img = env.render(mode='rgb_array')
            ax.cla()
            ax.imshow(img)
            tasks = ['pick and place', 'stack']
            ax.set_title('episode ' + str(num_episode) + ', frame ' +
                         str(frame_idx) + ', task: ' +
                         tasks[np.argmax(obs['observation'][0][-2:])])
            images.append(img)
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, done, _ = env.step(action)
            episode_reward += reward
            frame_idx += 1
            if args.export_gif:
                plt.imsave(
                    os.path.join(os.path.dirname(args.load_path),
                                 'tempimg%d.png' % i), img)
            else:
                plt.pause(0.02)
            if done:
                print('episode_reward', episode_reward)
                obs = env.reset()
                if 'FetchStack' in args.env:
                    while env.get_attr('current_nobject')[0] != env.get_attr('n_object')[0] or \
                                    env.get_attr('task_mode')[0] != 1:
                        obs = env.reset()
                elif 'MasspointPushDoubleObstacle' in args.env or 'FetchPushWallObstacle' in args.env:
                    while np.argmax(obs['desired_goal'][0][3:]) != 0:
                        obs = env.reset()
                print('goal', obs['desired_goal'][0])
                episode_reward = 0.0
                frame_idx = 0
                num_episode += 1
                if num_episode >= 1:
                    break
        exit()
        if args.export_gif:
            os.system('ffmpeg -r 5 -start_number 0 -i ' +
                      os.path.dirname(args.load_path) +
                      '/tempimg%d.png -c:v libx264 -pix_fmt yuv420p ' +
                      os.path.join(os.path.dirname(args.load_path), args.env +
                                   '.mp4'))
            for i in range(env_kwargs['max_episode_steps'] * 10):
                # images.append(plt.imread('tempimg' + str(i) + '.png'))
                try:
                    os.remove(
                        os.path.join(os.path.dirname(args.load_path),
                                     'tempimg' + str(i) + '.png'))
                except:
                    pass
Example #17
def main(args):
    log_dir = args.log_path if (
        args.log_path is not None
    ) else "/tmp/stable_baselines_" + time.strftime('%Y-%m-%d-%H-%M-%S')
    if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
        rank = 0
        configure_logger(log_dir)
    else:
        rank = MPI.COMM_WORLD.Get_rank()
        configure_logger(log_dir, format_strs=[])

    set_global_seeds(args.seed)

    model_class = SAC_SIR  # works also with SAC, DDPG and TD3

    env_kwargs = get_env_kwargs(args.env,
                                random_ratio=args.random_ratio,
                                sequential=args.sequential,
                                reward_type=args.reward_type,
                                n_object=args.n_object)

    def make_thunk(rank):
        return lambda: make_env(
            env_id=args.env, rank=rank, log_dir=log_dir, kwargs=env_kwargs)

    env = ParallelSubprocVecEnv(
        [make_thunk(i) for i in range(args.num_workers)], reset_when_done=True)

    def make_thunk_aug(rank):
        return lambda: FlattenDictWrapper(
            make_env(env_id=aug_env_name, rank=rank, kwargs=aug_env_kwargs),
            ['observation', 'achieved_goal', 'desired_goal'])

    aug_env_kwargs = env_kwargs.copy()
    del aug_env_kwargs['max_episode_steps']
    aug_env_name = args.env.split('-')[0] + 'Unlimit-' + args.env.split('-')[1]
    aug_env = ParallelSubprocVecEnv(
        [make_thunk_aug(i) for i in range(args.num_workers)],
        reset_when_done=False)

    if os.path.exists(os.path.join(logger.get_dir(), 'eval.csv')):
        os.remove(os.path.join(logger.get_dir(), 'eval.csv'))
        print('Remove existing eval.csv')
    eval_env_kwargs = env_kwargs.copy()
    eval_env_kwargs['random_ratio'] = 0.0
    eval_env = make_env(env_id=args.env, rank=0, kwargs=eval_env_kwargs)
    eval_env = FlattenDictWrapper(
        eval_env, ['observation', 'achieved_goal', 'desired_goal'])

    if not args.play:
        os.makedirs(log_dir, exist_ok=True)

    # Available strategies (cf paper): future, final, episode, random
    goal_selection_strategy = 'future'  # equivalent to GoalSelectionStrategy.FUTURE

    if not args.play:
        from stable_baselines.ddpg.noise import NormalActionNoise
        noise_type = args.action_noise.split('_')[0]
        if noise_type == 'none':
            parsed_action_noise = None
        elif noise_type == 'normal':
            sigma = float(args.action_noise.split('_')[1])
            parsed_action_noise = NormalActionNoise(
                mean=np.zeros(env.action_space.shape),
                sigma=sigma * np.ones(env.action_space.shape))
        else:
            raise NotImplementedError

        train_kwargs = get_train_kwargs("sac_sir", args, parsed_action_noise,
                                        eval_env, aug_env)

        def callback(_locals, _globals):
            if _locals['step'] % int(1e3) == 0:
                if 'FetchStack' in args.env:
                    mean_eval_reward = stack_eval_model(
                        eval_env,
                        _locals["self"],
                        init_on_table=(args.env == 'FetchStack-v2'))
                elif 'MasspointPushDoubleObstacle-v2' in args.env:
                    mean_eval_reward = egonav_eval_model(
                        eval_env,
                        _locals["self"],
                        env_kwargs["random_ratio"],
                        fixed_goal=np.array([4., 4., 0.15, 0., 0., 0., 1.]))
                    mean_eval_reward2 = egonav_eval_model(
                        eval_env,
                        _locals["self"],
                        env_kwargs["random_ratio"],
                        goal_idx=0,
                        fixed_goal=np.array([4., 4., 0.15, 1., 0., 0., 0.]))
                    log_eval(_locals['self'].num_timesteps,
                             mean_eval_reward2,
                             file_name="eval_box.csv")
                else:
                    mean_eval_reward = eval_model(eval_env, _locals["self"])
                log_eval(_locals['self'].num_timesteps, mean_eval_reward)
            if _locals['step'] % int(2e4) == 0:
                model_path = os.path.join(
                    log_dir, 'model_' + str(_locals['step'] // int(2e4)))
                model.save(model_path)
                print('model saved to', model_path)
            return True

        class CustomSACPolicy(SACPolicy):
            def __init__(self, *model_args, **model_kwargs):
                super(CustomSACPolicy, self).__init__(
                    *model_args,
                    **model_kwargs,
                    layers=[256, 256] if 'MasspointPushDoubleObstacle'
                    in args.env else [256, 256, 256, 256],
                    feature_extraction="mlp")

        register_policy('CustomSACPolicy', CustomSACPolicy)
        from utils.sac_attention_policy import AttentionPolicy
        register_policy('AttentionPolicy', AttentionPolicy)
        policy_kwargs = get_policy_kwargs("sac_sir", args)

        if rank == 0:
            print('train_kwargs', train_kwargs)
            print('policy_kwargs', policy_kwargs)
        # Wrap the model
        model = HER2(args.policy,
                     env,
                     model_class,
                     n_sampled_goal=4,
                     start_augment_time=args.start_augment,
                     goal_selection_strategy=goal_selection_strategy,
                     num_workers=args.num_workers,
                     policy_kwargs=policy_kwargs,
                     verbose=1,
                     **train_kwargs)
        print(model.get_parameter_list())

        # Train the model
        model.learn(
            int(args.num_timesteps),
            seed=args.seed,
            callback=callback,
            log_interval=100 if not ('MasspointMaze-v3' in args.env) else 10)

        if rank == 0:
            model.save(os.path.join(log_dir, 'final'))
Example #18
 def make_thunk_aug(rank):
     return lambda: FlattenDictWrapper(
         make_env(env_id=aug_env_name, rank=rank, kwargs=aug_env_kwargs),
         ['observation', 'achieved_goal', 'desired_goal'])
Example #19
def load(environment_name,
         env_id=None,
         concat_desired_goal=True,
         discount=1.0,
         max_episode_steps=None,
         sparse_reward=False,
         use_success_wrapper=True,
         gym_env_wrappers=(),
         alf_env_wrappers=(),
         wrap_with_process=False):
    """Loads the selected environment and wraps it with the specified wrappers.

    Note that by default a ``TimeLimit`` wrapper is used to limit episode lengths
    to the default benchmarks defined by the registered environments.

    Args:
        environment_name: Name for the environment to load.
        env_id: A scalar ``Tensor`` of the environment ID of the time step.
        concat_desired_goal (bool): If True, filter the Dict observation to
            ``["observation", "desired_goal"]`` and flatten it into a single
            vector.
        discount: Discount to use for the environment.
        max_episode_steps: If None the ``max_episode_steps`` will be set to the default
            step limit defined in the environment's spec. No limit is applied if set
            to 0 or if there is no ``timestep_limit`` set in the environment's spec.
        sparse_reward (bool): If True, the episode ends once the goal is achieved,
            and rewards are shifted by +1 (from -1/0 to 0/1).
        use_success_wrapper (bool): If True, wraps the environment with the
            SuccessWrapper which will record Success info after a specified
            amount of timesteps.
        gym_env_wrappers: Iterable with references to wrapper classes to use
            directly on the gym environment.
        alf_env_wrappers: Iterable with references to wrapper classes to use on
            the torch environment.
        wrap_with_process (bool): If True, run the wrapped environment in a
            separate process.

    Returns:
        An AlfEnvironment instance.
    """
    assert (environment_name.startswith("Fetch")
            or environment_name.startswith("HandManipulate")), (
                "This suite only supports OpenAI's Fetch and ShadowHand envs!")

    _unwrapped_env_checker_.check_and_update(wrap_with_process)

    gym_spec = gym.spec(environment_name)
    env = gym_spec.make()

    if max_episode_steps is None:
        if gym_spec.max_episode_steps is not None:
            max_episode_steps = gym_spec.max_episode_steps
        else:
            max_episode_steps = 0

    def env_ctor(env_id=None):
        return suite_gym.wrap_env(
            env,
            env_id=env_id,
            discount=discount,
            max_episode_steps=max_episode_steps,
            gym_env_wrappers=gym_env_wrappers,
            alf_env_wrappers=alf_env_wrappers,
            image_channel_first=False)

    # concat robot's observation and the goal location
    if concat_desired_goal:
        keys = ["observation", "desired_goal"]
        try:  # for modern Gym (>=0.15.4)
            from gym.wrappers import FilterObservation, FlattenObservation
            env = FlattenObservation(FilterObservation(env, keys))
        except ImportError:  # for older gym (<=0.15.3)
            from gym.wrappers import FlattenDictWrapper  # pytype:disable=import-error
            env = FlattenDictWrapper(env, keys)
    if use_success_wrapper:
        env = SuccessWrapper(env, max_episode_steps)
    env = ObservationClipWrapper(env)
    if sparse_reward:
        env = SparseReward(env)

    if wrap_with_process:
        process_env = process_environment.ProcessEnvironment(
            functools.partial(env_ctor))
        process_env.start()
        torch_env = alf_wrappers.AlfEnvironmentBaseWrapper(process_env)
    else:
        torch_env = env_ctor(env_id=env_id)

    return torch_env
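A hypothetical call, assuming the ALF helpers used above (suite_gym, SuccessWrapper, ObservationClipWrapper, ...) are importable and the Fetch envs are installed:

env = load('FetchPush-v1', max_episode_steps=50, sparse_reward=True)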
Example #20
    def run_games_for_agent(self, agent_number, agent_class):
        """Runs a set of games for a given agent, saving the results in self.results"""
        agent_results = []
        agent_name = agent_class.agent_name
        agent_group = self.agent_to_agent_group[agent_name]
        agent_round = 1
        # print("!!", self.config.environment)
        # print(self.config.environment._max_episode_steps)
        for run in range(self.config.runs_per_agent):
            agent_config = copy.deepcopy(self.config)

            if self.environment_has_changeable_goals(agent_config.environment) \
                and self.agent_cant_handle_changeable_goals_without_flattening(agent_name):
                print("Flattening changeable-goal environment for agent {}".
                      format(agent_name))
                agent_config.environment = FlattenDictWrapper(
                    agent_config.environment,
                    dict_keys=["observation", "desired_goal"])
            # print("!!!", agent_config.environment)
            # print(agent_config.environment.env._max_episode_steps)

            if self.config.randomise_random_seed:
                agent_config.seed = random.randint(0, 2**32 - 2)
            agent_config.hyperparameters = agent_config.hyperparameters[
                agent_group]
            print("AGENT NAME: {}".format(agent_name))
            print("\033[1m" +
                  "{}.{}: {}".format(agent_number, agent_round, agent_name) +
                  "\033[0m",
                  flush=True)
            agent = agent_class(agent_config)
            self.environment_name = agent.environment_title
            print(agent.hyperparameters)
            print("RANDOM SEED ", agent_config.seed)
            game_scores, rolling_scores, time_taken = agent.run_n_episodes(
            )  ##************
            print("Time taken: {}".format(time_taken), flush=True)
            self.print_two_empty_lines()

            episode_succeded = agent.achieved_required_score_at_index()
            if episode_succeded >= 0 and episode_succeded <= 1:
                # we will not accept runs that episode succeeded too early it is an anomaly
                print(
                    "Since this run succeeded at episode: {}, it will be neglected"
                    .format(episode_succeded))
                # print("The initial state of the anomaly is:")
                # print("list: ", agent.initial_state_list)
                # print(agent.initial_state_list[episode_succeded])
                # print("Recording the anomaly...")
                #
                # f = open("Anomalies.txt", 'a')
                # f2 = open("Anomaly_list.txt", 'a')
                # string = "[" + datetime.datetime.now().strftime("%Y-%m-%d %H:%M") + "Anamaly!"
                # string += "] [Agent: {}".format(agent_name)+"]\n"
                # string += "RANDOM SEED: {}".format(agent_config.seed) + "\n"
                # string += "Initial State: {}".format(agent.initial_state_list[episode_succeded])+"\n"
                # if("HER" in agent_name):
                #    string += "It is HER Case, L2 Norm is : "
                #    string += str(LA.norm(agent.initial_state_list[episode_succeded]["achieved_goal"] - agent.initial_state_list[episode_succeded]["desired_goal"]))
                #    f2.write(str(LA.norm(agent.initial_state_list[episode_succeded]["achieved_goal"] - agent.initial_state_list[episode_succeded]["desired_goal"]))+"\n")
                #    string += "\n--------------------------\n"
                # f.write(string)
                # f.close()
                # f2.close()

            else:
                agent_results.append([
                    game_scores, rolling_scores,
                    len(rolling_scores), -1 * max(rolling_scores), time_taken
                ])

                # if ("HER" in agent_name):
                #     f = open("Anomalies.txt", 'a')
                #     f2 = open("Normallist.txt", 'a')
                #     string1 = "[" + datetime.datetime.now().strftime("%Y-%m-%d %H:%M") + "Normal Case in HER!"
                #     string1 += "] [Agent: {}".format(agent_name) + "]\n"
                #     string1 += "RANDOM SEED: {}".format(agent_config.seed) + "\n"
                #     string1 += "Initial State: {}".format(agent.initial_state_list[episode_succeded]) + "\n"
                #     string1 += "It is HER Case, L2 Norm is : \n"
                #     string1 += str(LA.norm(agent.initial_state_list[episode_succeded]["achieved_goal"] -
                #                       agent.initial_state_list[episode_succeded]["desired_goal"]))
                #     string1 +=  "\n----------------------------\n"
                #     f.write(string1)
                #     f2.write(str(LA.norm(agent.initial_state_list[episode_succeded]["achieved_goal"] - agent.initial_state_list[episode_succeded]["desired_goal"]))+"\n")
                #     f.close()
                #     f2.close()

            if self.config.visualise_individual_results:
                self.visualise_overall_agent_results([rolling_scores],
                                                     agent_name,
                                                     show_each_run=True)
                #plt.show()
            agent_round += 1
        '''Saving Videos!'''
        # h, w, _ = self.render_file_list[0][0].shape
        # size = (w, h)
        # out = cv2.VideoWriter('videos/{}_{}.avi'.format(self.environment_name,agent_class.agent_name),
        #                       cv2.VideoWriter_fourcc(*'DIVX'), 120, size)
        # for i in range(len(self.render_file_list)):
        #     for j in range(len(self.render_file_list[0])):
        #         out.write(self.render_file_list[i][j])
        # out.release()
        # print("Saving Complete!!")
        self.results[agent_name] = agent_results
Example #21
def make_env(env_id,
             env_type,
             mpi_rank=0,
             subrank=0,
             seed=None,
             reward_scale=1.0,
             gamestate=None,
             flatten_dict_observations=True,
             wrapper_kwargs=None,
             env_kwargs=None,
             logger_dir=None,
             initializer=None):
    if initializer is not None:
        initializer(mpi_rank=mpi_rank, subrank=subrank)

    wrapper_kwargs = wrapper_kwargs or {}
    env_kwargs = env_kwargs or {}
    if ':' in env_id:
        import re
        import importlib
        module_name = re.sub(':.*', '', env_id)
        env_id = re.sub('.*:', '', env_id)
        importlib.import_module(module_name)
    if env_type == 'atari':
        env = make_atari(env_id)
    elif env_type == 'retro':
        import retro
        gamestate = gamestate or retro.State.DEFAULT
        env = retro_wrappers.make_retro(
            game=env_id,
            max_episode_steps=10000,
            use_restricted_actions=retro.Actions.DISCRETE,
            state=gamestate)
    elif env_type == 'robotics':
        env = gym.make(env_id)
        env = FlattenDictWrapper(
            env, ['observation', 'achieved_goal', 'desired_goal'])
    else:
        if env_id == 'LunarLanderContinuousPOMDP-v0':
            # hist_len, block_high, not_guided and give_state are assumed to
            # be defined elsewhere in the calling module.
            env = new_lunar_lander_pomdp_env(hist_len=hist_len,
                                             block_high=block_high,
                                             not_guided=not_guided,
                                             give_state=give_state)
        else:
            env = gym.make(env_id, **env_kwargs)

    if flatten_dict_observations and isinstance(env.observation_space,
                                                gym.spaces.Dict):
        keys = env.observation_space.spaces.keys()
        env = gym.wrappers.FlattenDictWrapper(env, dict_keys=list(keys))

    env.seed(seed + subrank if seed is not None else None)
    env = Monitor(env,
                  logger_dir
                  and os.path.join(logger_dir,
                                   str(mpi_rank) + '.' + str(subrank)),
                  allow_early_resets=True)

    if env_type == 'atari':
        env = wrap_deepmind(env, **wrapper_kwargs)
    elif env_type == 'retro':
        if 'frame_stack' not in wrapper_kwargs:
            wrapper_kwargs['frame_stack'] = 1
        env = retro_wrappers.wrap_deepmind_retro(env, **wrapper_kwargs)

    if isinstance(env.action_space, gym.spaces.Box):
        env = ClipActionsWrapper(env)

    if reward_scale != 1:
        env = retro_wrappers.RewardScaler(env, reward_scale)

    return env
Example #22
 def setUp(self):
     dict_env = gym.make('PendulumDictEnv-v0')
     self.dict_observation_space = dict_env.observation_space
     env = FlattenDictWrapper(dict_env,
                              dict_env.observation_space.spaces.keys())
     self.env = GymEnv(env)
Example #23
    env_id = sys.argv[1]
    model_path = sys.argv[2]
    env_kwargs = get_env_kwargs(env_id, random_ratio=0.0)

    def make_thunk(rank):
        return lambda: make_env(env_id=env_id, rank=rank, kwargs=env_kwargs)

    env = ParallelSubprocVecEnv([make_thunk(i) for i in range(1)],
                                reset_when_done=True)

    aug_env_id = env_id.split('-')[0] + 'Unlimit-' + env_id.split('-')[1]
    aug_env_kwargs = env_kwargs.copy()
    aug_env_kwargs['max_episode_steps'] = None

    aug_env = make_env(aug_env_id, rank=0, kwargs=aug_env_kwargs)
    aug_env = FlattenDictWrapper(
        aug_env, ['observation', 'achieved_goal', 'desired_goal'])

    goal_dim = aug_env.goal.shape[0]
    obs_dim = aug_env.observation_space.shape[0] - 2 * goal_dim
    noise_mag = aug_env.size_obstacle[1]
    n_object = aug_env.n_object
    model = HER2.load(model_path, env=env)
    model.model.env_id = env_id
    model.model.goal_dim = goal_dim
    model.model.obs_dim = obs_dim
    model.model.noise_mag = noise_mag
    model.model.n_object = n_object

    count1 = 0
    count2 = 0
    fail1 = [0, 0]
Example #24
def _make_flat(*args, **kwargs):
    # Use the legacy FlattenDictWrapper when it has been imported into this
    # module (gym <= 0.15.3); otherwise fall back to FilterObservation +
    # FlattenObservation (gym >= 0.15.4).
    if "FlattenDictWrapper" in globals():
        return FlattenDictWrapper(*args, **kwargs)
    return FlattenObservation(FilterObservation(*args, **kwargs))
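A hypothetical use of this helper, assuming a goal-based Fetch env is installed; on either code path the environment comes first, followed by the list of observation keys to keep:

import gym
env = _make_flat(gym.make('FetchReach-v1'), ['observation', 'desired_goal'])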
Example #25
def make_env(delta=False):
    env = FetchEnvBasic(delta)
    env = FlattenDictWrapper(env, ['observation'])
    return env