Example #1
    def __init__(self):
        self.mean = 0
        self.std = 1
        self.dims = 52
        self.lb = -1 * np.ones(self.dims)
        self.ub = 1 * np.ones(self.dims)
        self.counter = 0
        self.env = FlattenObservation(
            FilterObservation(gym.make('FetchReach-v1'),
                              ['observation', 'desired_goal']))
        self.num_rollouts = 3
        self.render = False
        self.policy_shape = (4, 13)

        #tunable hyper-parameters in LA-MCTS
        self.Cp = 10
        self.leaf_size = 100
        self.kernel_type = "linear"
        self.gamma_type = "auto"
        self.ninits = 30

        print("===========initialization===========")
        print("mean:", self.mean)
        print("std:", self.std)
        print("dims:", self.dims)
        print("policy:", self.policy_shape)
Example #2
def make_env(env_id, env_type, mpi_rank=0, subrank=0, seed=None, reward_scale=1.0, gamestate=None, flatten_dict_observations=True, wrapper_kwargs=None, logger_dir=None):
    wrapper_kwargs = wrapper_kwargs or {}
    if env_type == 'atari':
        env = make_atari(env_id)
    elif env_type == 'retro':
        import retro
        gamestate = gamestate or retro.State.DEFAULT
        env = retro_wrappers.make_retro(game=env_id, max_episode_steps=10000, use_restricted_actions=retro.Actions.DISCRETE, state=gamestate)
    else:
        env = gym.make(env_id)

    # Flatten Dict observation spaces into a single Box vector
    if flatten_dict_observations and isinstance(env.observation_space, gym.spaces.Dict):
        env = FlattenObservation(env)

    env.seed(seed + subrank if seed is not None else None)
    env = Monitor(env,
                  logger_dir and os.path.join(logger_dir, str(mpi_rank) + '.' + str(subrank)),
                  allow_early_resets=True)

    if env_type == 'atari':
        env = wrap_deepmind(env, **wrapper_kwargs)
    elif env_type == 'retro':
        if 'frame_stack' not in wrapper_kwargs:
            wrapper_kwargs['frame_stack'] = 1
        env = retro_wrappers.wrap_deepmind_retro(env, **wrapper_kwargs)

    if reward_scale != 1:
        env = retro_wrappers.RewardScaler(env, reward_scale)

    return env
Example #3
class Reacher:
    def __init__(self):
        self.mean = 0
        self.std = 1
        self.dims = 52
        self.lb = -1 * np.ones(self.dims)
        self.ub = 1 * np.ones(self.dims)
        self.counter = 0
        self.env = FlattenObservation(
            FilterObservation(gym.make('FetchReach-v1'),
                              ['observation', 'desired_goal']))
        self.num_rollouts = 3
        self.render = False
        self.policy_shape = (4, 13)

        #tunable hyper-parameters in LA-MCTS
        self.Cp = 10
        self.leaf_size = 100
        self.kernel_type = "linear"
        self.gamma_type = "auto"
        self.ninits = 30

        print("===========initialization===========")
        print("mean:", self.mean)
        print("std:", self.std)
        print("dims:", self.dims)
        print("policy:", self.policy_shape)

    def __call__(self, x):
        self.counter += 1
        assert len(x) == self.dims
        assert x.ndim == 1
        assert np.all(x <= self.ub) and np.all(x >= self.lb)

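        # Reshape the flat parameter vector into a linear policy matrix (action_dim x obs_dim)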
        M = x.reshape(self.policy_shape)

        returns = []
        observations = []
        actions = []

        for i in range(self.num_rollouts):
            obs = self.env.reset()
            done = False
            totalr = 0.
            steps = 0
            while not done:
                # M      = self.policy
                inputs = (obs - self.mean) / self.std
                action = np.dot(M, inputs)
                observations.append(obs)
                actions.append(action)
                obs, r, done, _ = self.env.step(action)
                totalr += r
                steps += 1
                if self.render:
                    self.env.render()
            returns.append(totalr)

        return np.mean(returns) * -1
Example #4
def test_flatten_observation(env_id):
    env = gym.make(env_id)
    wrapped_env = FlattenObservation(env)

    obs = env.reset()
    wrapped_obs = wrapped_env.reset()

    assert len(obs.shape) == 3
    assert len(wrapped_obs.shape) == 1
    assert wrapped_obs.shape[0] == obs.shape[0] * obs.shape[1] * obs.shape[2]
Example #5
def make_robotics_env(env_id, seed, rank=0):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.
    """
    set_global_seeds(seed)
    env = gym.make(env_id)
    env = FlattenObservation(FilterObservation(env, ['observation', 'desired_goal']))
    env = Monitor(
        env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
        info_keywords=('is_success',))
    env.seed(seed)
    return env
Example #6
def test_flatten_observation(env_id):
    env = gym.make(env_id)
    wrapped_env = FlattenObservation(env)

    obs = env.reset()
    wrapped_obs = wrapped_env.reset()

    space = spaces.Tuple(
        (spaces.Discrete(32), spaces.Discrete(11), spaces.Discrete(2)))
    wrapped_space = spaces.Box(0, 1, [32 + 11 + 2], dtype=np.int64)

    assert space.contains(obs)
    assert wrapped_space.contains(wrapped_obs)
Example #7
    def test_flattened_environment(self, observation_space, ordered_values):
        """
        make sure that flattened observations occur in the order expected
        """
        env = FakeEnvironment(observation_space=observation_space)
        wrapped_env = FlattenObservation(env)
        flattened = wrapped_env.reset()

        unflattened = unflatten(env.observation_space, flattened)
        original = env.observation

        self._check_observations(original, flattened, unflattened,
                                 ordered_values)
Example #8
def make_env(with_monitor=False,folder_name='results'):
    env = gym.make("FetchReach-v1")
    env.env.reward_type = 'dense'
    env = FlattenObservation(FilterObservation(env, ['observation', 'desired_goal']))
    if with_monitor:
        env = gym.wrappers.Monitor(env, folder_name, force=True)
    return env 
Example #9
def make_env(env_id,
             env_type,
             args,
             mpi_rank=0,
             subrank=0,
             seed=None,
             reward_scale=1.0,
             gamestate=None,
             flatten_dict_observations=True,
             wrapper_kwargs=None,
             env_kwargs=None,
             logger_dir=None,
             initializer=None):
    if initializer is not None:
        initializer(mpi_rank=mpi_rank, subrank=subrank)

    wrapper_kwargs = wrapper_kwargs or {}
    env_kwargs = env_kwargs or {}
    if ':' in env_id:
        import re
        import importlib
        module_name = re.sub(':.*', '', env_id)
        env_id = re.sub('.*:', '', env_id)
        importlib.import_module(module_name)
    env = gym.make(env_id, **env_kwargs)

    # Adding RM wrappers if needed
    if args.alg.endswith("hrm") or args.alg.endswith("dhrm"):
        env = HierarchicalRMWrapper(env, args.r_min, args.r_max,
                                    args.use_self_loops, args.use_rs,
                                    args.gamma, args.rs_gamma)

    if args.use_rs or args.use_crm:
        env = RewardMachineWrapper(env, args.use_crm, args.use_rs, args.gamma,
                                   args.rs_gamma)

    if flatten_dict_observations and isinstance(env.observation_space,
                                                gym.spaces.Dict):
        env = FlattenObservation(env)

    env.seed(seed + subrank if seed is not None else None)
    env = Monitor(env,
                  logger_dir
                  and os.path.join(logger_dir,
                                   str(mpi_rank) + '.' + str(subrank)),
                  allow_early_resets=True)

    if isinstance(env.action_space, gym.spaces.Box):
        env = ClipActionsWrapper(env)

    if reward_scale != 1:
        env = retro_wrappers.RewardScaler(env, reward_scale)

    return env
Example #10
    def test_nested_dicts_size(self, observation_space, flat_shape):
        env = FakeEnvironment(observation_space=observation_space)

        # Make sure we are testing the right environment for the test.
        observation_space = env.observation_space
        assert isinstance(observation_space, Dict)

        wrapped_env = FlattenObservation(FilterObservation(env, env.obs_keys))
        assert wrapped_env.observation_space.shape == flat_shape

        assert wrapped_env.observation_space.dtype == np.float32
Example #11
def make_env(env_id, env_type, mpi_rank=0, subrank=0, seed=None, reward_scale=1.0, gamestate=None,
            flatten_dict_observations=True, wrapper_kwargs=None, env_kwargs=None, logger_dir=None, initializer=None):
    if initializer is not None:
        initializer(mpi_rank=mpi_rank, subrank=subrank)

    wrapper_kwargs = wrapper_kwargs or {}
    env_kwargs = env_kwargs or {}
    if ':' in env_id:
        import importlib
        import re
        module_name = re.sub(':.*','',env_id)
        env_id = re.sub('.*:', '', env_id)
        importlib.import_module(module_name)

    env = gym.make(env_id, **env_kwargs)
    # if env_id.startswith('Sawyer'):
    #     from mher.algos.multi_world_wrapper import SawyerGoalWrapper
    #     env = SawyerGoalWrapper(env)
    # if (env_id.startswith('Sawyer') or env_id.startswith('Point2D')) and not hasattr(env, '_max_episode_steps'):
    #     env = gym.wrappers.TimeLimit(env, max_episode_steps=100)

    if flatten_dict_observations and isinstance(env.observation_space, gym.spaces.Dict):
        env = FlattenObservation(env)

    env.seed(seed + subrank if seed is not None else None)
    env = Monitor(env,
                  logger_dir and os.path.join(logger_dir, str(mpi_rank) + '.' + str(subrank)),
                  allow_early_resets=True)

    if isinstance(env.action_space, gym.spaces.Box):
        env = ClipActionsWrapper(env)

    if reward_scale != 1:
        env = retro_wrappers.RewardScaler(env, reward_scale)
    return env
Example #12
def make_robotics_env(env_id, seed, rank=0, allow_early_resets=True):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.

    :param env_id: (str) the environment ID
    :param seed: (int) the initial seed for RNG
    :param rank: (int) the rank of the environment (for logging)
    :param allow_early_resets: (bool) allows early reset of the environment
    :return: (Gym Environment) The robotic environment
    """
    set_global_seeds(seed)
    env = gym.make(env_id)
    keys = ['observation', 'desired_goal']
    # TODO: remove try-except once most users are running modern Gym
    try:  # for modern Gym (>=0.15.4)
        from gym.wrappers import FilterObservation, FlattenObservation
        env = FlattenObservation(FilterObservation(env, keys))
    except ImportError:  # for older gym (<=0.15.3)
        from gym.wrappers import FlattenDictWrapper  # pytype:disable=import-error
        env = FlattenDictWrapper(env, keys)
    env = Monitor(env,
                  logger.get_dir()
                  and os.path.join(logger.get_dir(), str(rank)),
                  info_keywords=('is_success', ),
                  allow_early_resets=allow_early_resets)
    env.seed(seed)
    return env
Example #13
def setup_wrappers(env):
    obs_shape = env.observation_space.shape
    is_image = len(obs_shape) == 3
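    # Image observations (H, W, C) are converted to grayscale and flattened into a 1-D vector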
    if is_image:
        from gym.wrappers import GrayScaleObservation
        from gym.wrappers import FlattenObservation
        from gym.wrappers import ResizeObservation

        env = GrayScaleObservation(env)
        # env = ResizeObservation(env, (obs_shape[0]//3, obs_shape[0]//3))
        env = FlattenObservation(env)

    return env
Example #14
def create_goal_gym_env(**kwargs):
    frames = kwargs.pop('frames', 1)
    name = kwargs.pop('name')
    limit_steps = kwargs.pop('limit_steps', False)

    env = gym.make(name, **kwargs)
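    # Keep only the observation and desired_goal keys, then flatten them into a single Box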
    env = FlattenObservation(
        FilterObservation(env, ['observation', 'desired_goal']))

    if frames > 1:
        env = wrappers.FrameStack(env, frames, False)
    if limit_steps:
        env = wrappers.LimitStepsWrapper(env)
    return env
Example #15
def test_flatten_observation(env_id):
    env = gym.make(env_id)
    wrapped_env = FlattenObservation(env)

    obs = env.reset()
    wrapped_obs = wrapped_env.reset()

    if env_id == 'Blackjack-v0':
        space = spaces.Tuple(
            (spaces.Discrete(32), spaces.Discrete(11), spaces.Discrete(2)))
        wrapped_space = spaces.Box(-np.inf,
                                   np.inf, [32 + 11 + 2],
                                   dtype=np.float32)
    elif env_id == 'KellyCoinflip-v0':
        space = spaces.Tuple(
            (spaces.Box(0, 250.0, [1],
                        dtype=np.float32), spaces.Discrete(300 + 1)))
        wrapped_space = spaces.Box(-np.inf,
                                   np.inf, [1 + (300 + 1)],
                                   dtype=np.float32)

    assert space.contains(obs)
    assert wrapped_space.contains(wrapped_obs)
Example #16
def env_fn():
    env = gym.make(env_name)
    if constraint is not None:
        if use_aug:
            augmentation_type = 'constraint_state_concat'
        else:
            augmentation_type = 'None'
        use_dense = dense_coeff > 0.
        env = ConstraintEnv(
            env,
            [get_constraint(constraint)(False, use_dense, dense_coeff)],
            augmentation_type=augmentation_type,
            log_dir='../tests/' + exp_name)
    fcenv = FlattenObservation(env)
    return fcenv
Example #17
    def _thunk():
        env = gym.make(env_id)
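        # Fetch envs return Dict observations, so their goal keys are flattened via FlattenDictWrapper instead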
        if env_id.find('Fetch') == -1:
            env = FlattenObservation(env)
        else:
            env = FlattenDictWrapper(env, ['achieved_goal', 'desired_goal'])
        env = RandomizedEnvWrapper(env, seed + rank)

        env.seed(seed + rank)

        return env
Example #18
def train(env, type, timesteps):
    env.reset()
    print(check_env(env))
    env = FlattenObservation(env)
    print(env.reward_range)
    print(env.action_space)
    if type == "DQN":
        model = DQN('MlpPolicy',
                    exploration_fraction=0.999,
                    env=env,
                    verbose=1)
    elif type == "A2C":
        model = A2C('MlpPolicy', env=env, verbose=1)
    elif type == "PPO":
        model = PPO('MlpPolicy', env=env, verbose=1)

    model.learn(total_timesteps=timesteps)
    model.save("model_cups")
Example #19
        def make_env():

            # wrapped_env -> flatten_observation -> monitor -> clip_action -> scale_reward

            env = make_wrapped_env()
            env.seed(seed + subrank if seed is not None else None)

            if flatten_dict_observations:# and isinstance(env.observation_space, gym.spaces.Dict):
                env = FlattenObservation(env)

            env = Monitor(env,
                          osp.join(monitor_log_dir, str(mpi_rank) + '.' + str(subrank)),  # training and eval write to same file?
                          allow_early_resets=True)
            env = ClipActionsWrapper(env)

            if reward_scale != 1:
                env = retro_wrappers.RewardScaler(env, reward_scale)

            return env
Example #20
def main():
    as_gdads = True
    name = "pointmass"
    drop_abs_position = True

    dads_env_fn = envs_fns[name]
    conf: Conf = CONFS[name]

    dict_env = as_dict_env(dads_env_fn())
    dict_env = TimeLimit(dict_env, max_episode_steps=conf.ep_len)
    if drop_abs_position:
        dict_env = DropGoalEnvsAbsoluteLocation(dict_env)
    if as_gdads:
        flat_env = SkillWrapper(env=dict_env, skill_reset_steps=conf.ep_len // 2)
    else:
        flat_obs_content = ["observation", "desired_goal", "achieved_goal"]
        if drop_abs_position:
            flat_obs_content.remove("achieved_goal")  # Because always 0 vector
        flat_env = FlattenObservation(FilterObservation(dict_env, filter_keys=flat_obs_content))

    flat_env = TransformReward(flat_env, f=lambda r: r*conf.reward_scaling)
    flat_env = Monitor(flat_env)

    filename = f"modelsCommandSkills/{name}-gdads{as_gdads}"
    if os.path.exists(filename + ".zip"):
        sac = SAC.load(filename, env=flat_env)
        if as_gdads:
            flat_env.load(filename)
    else:
        sac = SAC("MlpPolicy", env=flat_env, verbose=1, learning_rate=conf.lr,
                  tensorboard_log=f"{filename}-tb", buffer_size=10000)
        train(model=sac, conf=conf, save_fname=filename)
        if as_gdads:
            flat_env.save(filename)

    if as_gdads:
        flat_env.set_sac(sac)
        eval_dict_env(dict_env=dict_env,
                      model=flat_env,
                      ep_len=conf.ep_len)
    show(model=sac, env=flat_env, conf=conf)
Example #21
def make_env(env_id, env_type, mpi_rank=0, subrank=0, seed=None, reward_scale=1.0, gamestate=None, flatten_dict_observations=True, wrapper_kwargs=None, env_kwargs=None, logger_dir=None, initializer=None):
    if initializer is not None:
        initializer(mpi_rank=mpi_rank, subrank=subrank)

    wrapper_kwargs = wrapper_kwargs or {}
    env_kwargs = env_kwargs or {}
    if ':' in env_id:
        import re
        import importlib
        module_name = re.sub(':.*','',env_id)
        env_id = re.sub('.*:', '', env_id)
        importlib.import_module(module_name)
    if env_type == 'atari':
        env = make_atari(env_id)
    elif env_type == 'retro':
        import retro
        gamestate = gamestate or retro.State.DEFAULT
        env = retro_wrappers.make_retro(game=env_id, max_episode_steps=10000, use_restricted_actions=retro.Actions.DISCRETE, state=gamestate)
    else:
        env = gym.make(env_id, **env_kwargs)

    if flatten_dict_observations and isinstance(env.observation_space, gym.spaces.Dict):
        env = FlattenObservation(env)

    env.seed(seed + subrank if seed is not None else None)
    env = Monitor(env,
                  logger_dir and os.path.join(logger_dir, str(mpi_rank) + '.' + str(subrank)),
                  allow_early_resets=True)


    if env_type == 'atari':
        env = wrap_deepmind(env, **wrapper_kwargs)
    elif env_type == 'retro':
        if 'frame_stack' not in wrapper_kwargs:
            wrapper_kwargs['frame_stack'] = 1
        env = retro_wrappers.wrap_deepmind_retro(env, **wrapper_kwargs)

    if isinstance(env.action_space, gym.spaces.Box):
        env = ClipActionsWrapper(env)

    if reward_scale != 1:
        env = retro_wrappers.RewardScaler(env, reward_scale)

    return env
Example #22
def make_robotics_env(env_id, seed, rank=0):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.
    """
    set_global_seeds(seed)
    env = gym.make(env_id)
    #env = FlattenDictWrapper(env, ['observation', 'desired_goal'])
    keys = ['observation', 'desired_goal']
    # TODO: remove try-except once most users are running modern Gym
    try:  # for modern Gym (>=0.15.4)
        from gym.wrappers import FilterObservation, FlattenObservation
        env = FlattenObservation(FilterObservation(env, keys))
    except ImportError:  # for older gym (<=0.15.3)
        from gym.wrappers import FlattenDictWrapper  # pytype:disable=import-error
        env = FlattenDictWrapper(env, keys)
    env = Monitor(env,
                  logger.get_dir()
                  and os.path.join(logger.get_dir(), str(rank)),
                  info_keywords=('is_success', ))
    env.seed(seed)
    return env
Example #23
def make_env(env_id,
             env_type,
             mpi_rank=0,
             subrank=0,
             seed=None,
             reward_scale=1.0,
             gamestate=None,
             flatten_dict_observations=True,
             wrapper_kwargs=None,
             env_kwargs=None,
             logger_dir=None,
             initializer=None):
    """
    Make environment

    Args:
        env_id: (str) environment id e.g. 'Reacher-v2'
        env_type: (str) environment type e.g. 'atari'
        mpi_rank: (int) rank for mpi; default=0 (disabled on windows for lack of MPI support from pytorch)
        subrank: (int) subrank; default=0 (disabled on windows for lack of MPI support from pytorch)
        seed: (int) random seed
        reward_scale: (float) multiplicative factor applied to rewards (not a discount factor); default=1.0
        gamestate: (str) game state to load (retro games only)
        flatten_dict_observations: (bool) if True, flatten Dict observation spaces into a single Box
        wrapper_kwargs: (dict) dictionary of parameter settings for wrapper
        env_kwargs: (dict) dictionary of parameter settings for environment
        logger_dir: (str) logger path
        initializer: (callable) optional callable invoked with mpi_rank and subrank before the env is created

    Returns:
        env: (Env) the set-up environment
    """
    if initializer is not None:
        initializer(mpi_rank=mpi_rank, subrank=subrank)

    wrapper_kwargs = wrapper_kwargs or {}
    env_kwargs = env_kwargs or {}

    if ':' in env_id:
        raise ValueError(
            "env_id {} does not conform to accepted format!".format(env_id))

    if env_type == 'atari':
        # make atari environments with a wrapper function
        env = make_atari(env_id)
    elif env_type == 'retro':
        raise ValueError("retro environments not supported yet!")
    else:
        # make a gym environment with parameter settings
        env = gym.make(env_id, **env_kwargs)

    # flatten the observation space
    if flatten_dict_observations and isinstance(env.observation_space,
                                                gym.spaces.Dict):
        env = FlattenObservation(env)

    # add seed to env
    env.seed(seed + subrank if seed is not None else None)

    # set up Monitor (TBD)

    if env_type == 'atari':
        env = wrap_deepmind(env, **wrapper_kwargs)
    elif env_type == 'retro':
        if 'frame_stack' not in wrapper_kwargs:
            wrapper_kwargs['frame_stack'] = 1
        # wrap retro games
        env = wrappers_retro.wrap_deepmind_retro(env, **wrapper_kwargs)

    if isinstance(env.action_space, gym.spaces.Box):
        # if action_space is Box type, clip the action values to be within the box's boundaries
        env = wrappers.ClipActionsWrapper(env)

    if reward_scale != 1:
        # if reward scaling factor is used, scale the rewards accordingly
        # very important feature for PPO
        env = wrappers.RewardScalerWrapper(env, reward_scale)

    return env
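# A minimal call sketch for the function above, based only on its signature and
# docstring: 'Reacher-v2' is the docstring's own example id, and the remaining
# values are illustrative defaults rather than settings prescribed by the source.
env = make_env(env_id='Reacher-v2',  # plain MuJoCo id, so the generic gym.make branch runs
               env_type='mujoco',    # any value other than 'atari'/'retro' falls through to gym.make
               seed=0,               # seeded inside make_env as seed + subrank
               reward_scale=1.0)     # rewards left unscaled
obs = env.reset()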
Example #24
    def test_nested_dicts_ravel(self, observation_space, flat_shape):
        env = FakeEnvironment(observation_space=observation_space)
        wrapped_env = FlattenObservation(FilterObservation(env, env.obs_keys))
        obs = wrapped_env.reset()
        assert obs.shape == wrapped_env.observation_space.shape
Example #25
        solver_kwargs={},

        # Define and parameterize the reference generator for the current reference
        reference_generator=WienerProcessReferenceGenerator(
            reference_state='i', sigma_range=(3e-3, 3e-2)),

        # Defines which variables to plot via the builtin dashboard monitor
        visualization=MotorDashboard(state_plots=['i', 'omega']),
    )

    # Now, the environment will output states and references separately
    state, ref = env.reset()

    # For data processing we sometimes want to flatten the env output,
    # which means that the env will only output one array that contains states and references consecutively
    env = FlattenObservation(env)
    obs = env.reset()

    # Read the number of possible actions for the given env
    # this allows us to define a proper learning agent for this task
    nb_actions = env.action_space.n

    window_length = 1

    # Define an artificial neural network to be used within the agent
    model = Sequential()
    # The network's input fits the observation space of the env
    model.add(
        Flatten(input_shape=(window_length, ) + env.observation_space.shape))
    model.add(Dense(16, activation='relu'))
    model.add(Dense(16, activation='relu'))
Example #26
        visualization=MotorDashboard(visu_period=0.5,
                                     plotted_variables=['omega', 'i', 'u']),
        converter='Disc-4QC',
        # Take standard class and pass parameters (Load)
        a=0,
        b=.1,
        c=1.1,
        j_load=0.4,
        # Pass a string (with extra parameters)
        ode_solver='euler',
        solver_kwargs={},
        # Pass a Class with extra parameters
        reference_generator=WienerProcessReferenceGenerator(
            reference_state='i', sigma_range=(5e-3, 5e-1)))
    nb_actions = env.action_space.n
    env = FlattenObservation(env)
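    # Build a small feed-forward Q-network over the flattened observation vector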
    model = Sequential()
    model.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
    model.add(Dense(4))
    model.add(LeakyReLU(alpha=0.05))
    model.add(Dense(4))
    model.add(LeakyReLU(alpha=0.05))
    model.add(Dense(nb_actions))
    model.add(Activation('linear'))

    memory = SequentialMemory(limit=15000, window_length=1)
    policy = LinearAnnealedPolicy(EpsGreedyQPolicy(eps=0.5), 'eps', 0.5, 0.01,
                                  0, 20000)
    dqn = DQNAgent(model=model,
                   policy=policy,
                   nb_actions=nb_actions,
Example #27
def env_fn():
    env = gym.make(ENV)
    print(env.observation_space)
    env = FlattenObservation(env)
    return env
Example #28
import gym
import multiprocessing
import neat
import numpy as np
import os
import pickle
import random
import time
from gym.wrappers import FlattenObservation, FilterObservation
import visualize

NUM_CORES = 1

env = gym.make('FetchReach-v1')
env.env.reward_type = 'dense'
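# FilterObservation keeps only the listed dict keys; FlattenObservation concatenates them into one flat array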
env = FlattenObservation(FilterObservation(env, ['observation', 'desired_goal']))

print("action space: {0!r}".format(env.action_space))
print("observation space: {0!r}".format(env.observation_space))

env = gym.wrappers.Monitor(env, 'results', force=True)

class RoboGenome(neat.DefaultGenome):
    def __init__(self, key):
        super().__init__(key)
        self.discount = None

    def configure_new(self, config):
        super().configure_new(config)
        self.discount = 0.01 + 0.98 * random.random()
Example #29
def load(environment_name,
         env_id=None,
         concat_desired_goal=True,
         discount=1.0,
         max_episode_steps=None,
         sparse_reward=False,
         use_success_wrapper=True,
         gym_env_wrappers=(),
         alf_env_wrappers=(),
         wrap_with_process=False):
    """Loads the selected environment and wraps it with the specified wrappers.

    Note that by default a ``TimeLimit`` wrapper is used to limit episode lengths
    to the default benchmarks defined by the registered environments.

    Args:
        environment_name: Name for the environment to load.
        env_id: A scalar ``Tensor`` of the environment ID of the time step.
        discount: Discount to use for the environment.
        max_episode_steps: If None the ``max_episode_steps`` will be set to the default
            step limit defined in the environment's spec. No limit is applied if set
            to 0 or if there is no ``timestep_limit`` set in the environment's spec.
        sparse_reward (bool): If True, the game ends once the goal is achieved.
            Rewards are shifted by +1, changing them from -1/0 to 0/1.
        use_success_wrapper (bool): If True, wraps the environment with the
            SuccessWrapper which will record Success info after a specified
            amount of timesteps.
        gym_env_wrappers: Iterable with references to wrapper classes to use
            directly on the gym environment.
        alf_env_wrappers: Iterable with references to wrapper classes to use on
            the torch environment.

    Returns:
        An AlfEnvironment instance.
    """
    assert (environment_name.startswith("Fetch")
            or environment_name.startswith("HandManipulate")), (
                "This suite only supports OpenAI's Fetch and ShadowHand envs!")

    _unwrapped_env_checker_.check_and_update(wrap_with_process)

    gym_spec = gym.spec(environment_name)
    env = gym_spec.make()

    if max_episode_steps is None:
        if gym_spec.max_episode_steps is not None:
            max_episode_steps = gym_spec.max_episode_steps
        else:
            max_episode_steps = 0

    def env_ctor(env_id=None):
        return suite_gym.wrap_env(
            env,
            env_id=env_id,
            discount=discount,
            max_episode_steps=max_episode_steps,
            gym_env_wrappers=gym_env_wrappers,
            alf_env_wrappers=alf_env_wrappers,
            image_channel_first=False)

    # concat robot's observation and the goal location
    if concat_desired_goal:
        keys = ["observation", "desired_goal"]
        try:  # for modern Gym (>=0.15.4)
            from gym.wrappers import FilterObservation, FlattenObservation
            env = FlattenObservation(FilterObservation(env, keys))
        except ImportError:  # for older gym (<=0.15.3)
            from gym.wrappers import FlattenDictWrapper  # pytype:disable=import-error
            env = FlattenDictWrapper(env, keys)
    if use_success_wrapper:
        env = SuccessWrapper(env, max_episode_steps)
    env = ObservationClipWrapper(env)
    if sparse_reward:
        env = SparseReward(env)

    if wrap_with_process:
        process_env = process_environment.ProcessEnvironment(
            functools.partial(env_ctor))
        process_env.start()
        torch_env = alf_wrappers.AlfEnvironmentBaseWrapper(process_env)
    else:
        torch_env = env_ctor(env_id=env_id)

    return torch_env
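# A minimal usage sketch for the loader above; 'FetchReach-v1' satisfies the
# Fetch/HandManipulate assertion, and every other argument keeps its default.
alf_env = load('FetchReach-v1')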
Example #30
    def train(self):
        """Method for training the Network"""

        for epoch in range(self.n_epochs):
            for episode in range(self.n_episodes):

                done = False
                score = 0

                episode_experience = []

                # Reset the environment to its initial state; keep the raw Dict
                # observation, since its keys are indexed below
                observation = self.env.reset()

                # Because we are not working with a continuous action space,
                # we limit ourselves to a finite number of timesteps per episode;
                # otherwise the loop below would be replaced with `while not done:`

                for _ in range(self.n_time_steps):

                    self.env.render()
                    action = self.act(observation['observation'])
                    print(action)
                    new_observation, reward, done, info = self.env.step(action)

                    score += reward

                    episode_experience.append(
                        (observation['observation'], action, reward,
                         new_observation['observation'], done))

                    self.save(np.asarray(observation['observation']), action,
                              reward, new_observation['observation'], done)

                    observation = new_observation
                    self.learn()

                    # break if we finish the environment
                    if done is True:
                        break

                # HER Algorithm
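                # Hindsight relabeling: reuse a state reached later in the episode
                # as the goal and recompute reward/done with respect to it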
                for t in range(len(episode_experience)):
                    for _ in range(self.K):
                        future = np.random.randint(t, len(episode_experience))
                        goal = episode_experience[future][3]
                        state = episode_experience[t][0]
                        action = episode_experience[t][1]
                        next_state = episode_experience[t][3]
                        done = np.array_equal(next_state, goal)
                        reward = 0 if done else -1

                        self.save(state, action, reward, next_state, done)

            # save the model every 10 epochs
            # (an arbitrary interval that may change)
            if epoch % 10 == 0 and epoch > 0:
                self.save_model()