def __init__(self,
              name,
              globalAC,
              hard_share=None,
              soft_sharing_coeff_actor=0.0,
              soft_sharing_coeff_critic=0.0,
              gradient_clip_actor=0.0,
              gradient_clip_critic=0.0,
              debug=False,
              max_ep_steps=200,
              image_shape=None,
              stack=1):
     self.env = gym.make(GAME).unwrapped
     self.env = TimeLimit(self.env, max_episode_steps=max_ep_steps)
     self.name = name
     self.AC = ACNet(name,
                     globalAC,
                     hard_share=hard_share,
                     soft_sharing_coeff_actor=soft_sharing_coeff_actor,
                     soft_sharing_coeff_critic=soft_sharing_coeff_critic,
                     gradient_clip_actor=gradient_clip_actor,
                     gradient_clip_critic=gradient_clip_critic,
                     image_shape=image_shape,
                     stack=stack)
     self.debug = debug
     self.image_shape = image_shape
     self.stack = stack
Example No. 2
def reacher(n_links=2):
    if n_links == 2:
        return TimeLimit(Reacher2Link(),
                         max_episode_steps=50,
                         max_episode_seconds=None)
    elif n_links == 3:
        return TimeLimit(Reacher3Link(),
                         max_episode_steps=50,
                         max_episode_seconds=None)
    else:
        raise ValueError(f"reacher() supports only 2 or 3 links, got {n_links}")
Example No. 3
    def evaluate(self):
        test_env = TimeLimit(gym.make('PepperPush-v0'), max_episode_steps=100)
        obs = test_env.reset()
        results = evaluate_policy(self.model,
                                  test_env,
                                  n_eval_episodes=75,
                                  return_episode_rewards=False)

        return results[0]
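With return_episode_rewards=False, stable-baselines3's evaluate_policy returns a (mean_reward, std_reward) tuple, so results[0] above is the mean episode reward. A minimal sketch that unpacks both values instead, assuming the same self.model and test_env as above:

        # Sketch: unpack mean and std instead of indexing into the tuple.
        mean_reward, std_reward = evaluate_policy(self.model,
                                                  test_env,
                                                  n_eval_episodes=75)
        print(f"mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")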
Example No. 4
def run_episode(env: TimeLimit, estimator: CNN_DQN):
    obs = env.reset()
    state = get_state(obs)
    is_done = False
    total_reward = 0

    while not is_done:
        actionIndex = torch.argmax(estimator.predict(state)).item()
        action = ACTIONS[actionIndex]
        obs, reward, is_done, info = env.step(action)
        state = get_state(obs)
        total_reward += reward

    return total_reward
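Since run_episode follows the greedy policy for a single episode, evaluation usually averages it over several episodes. A minimal sketch, assuming the same env and estimator objects as above:

# Sketch: average greedy returns over a handful of evaluation episodes.
n_eval_episodes = 10
returns = [run_episode(env, estimator) for _ in range(n_eval_episodes)]
print(f"mean return over {n_eval_episodes} episodes: {sum(returns) / n_eval_episodes:.2f}")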
Example No. 5
def run_discrete(environment_name, mapping, shape=None, problem=None):
    if problem is None:
        problem = gym.make(environment_name)
    else:
        from gym.wrappers.time_limit import TimeLimit
        problem = TimeLimit(problem)

    print('== {} =='.format(environment_name))
    print('Actions:', problem.env.action_space.n)
    print('States:', problem.env.observation_space.n)
    print(problem.env.desc)
    print()

    print('== Value Iteration ==')
    value_policy, iters = value_iteration(problem)
    print('Iterations:', iters)
    print()

    print('== Policy Iteration ==')
    policy, iters = policy_iteration(problem)
    print('Iterations:', iters)
    print()

    diff = sum(
        [abs(x - y) for x, y in zip(policy.flatten(), value_policy.flatten())])
    if diff > 0:
        print('Discrepancy:', diff)
        print()

    if shape is not None:
        print('== Policy ==')
        print_policy(policy, mapping, shape)
        print()

    return policy
Example No. 6
def get_agent_and_runner(max_timesteps=EPISODE_MAX_LENGTH):
    max_timesteps = EPISODE_MAX_LENGTH if max_timesteps is None else max_timesteps
    # OpenAI-Gym environment specification
    gym_environment = gym.make(LEVEL, render=True)
    gym_environment = TimeLimit(gym_environment.unwrapped,
                                max_episode_steps=max_timesteps)
    # gym_environment = Monitor(gym_environment, RECORD_DICT, force=True)

    environment = Environment.create(
        environment=gym_environment,
        max_episode_timesteps=gym_environment.spec.max_episode_steps,
    )

    agent = Agent.create(
        agent='a2c',
        environment=environment,
        # parallel_interactions=PARALLEL,
        # Automatically configured network
        # network='auto',
        network=[
            dict(type='dense', size=256, activation='tanh'),
            dict(type='dense', size=256, activation='tanh'),
            dict(type='dense', size=256, activation='tanh'),
        ],
        # AC optimization parameters
        batch_size=256,
        update_frequency=2,
        learning_rate=0.001,
        # Reward estimation
        discount=0.99,
        predict_terminal_values=False,
        # Regularization
        l2_regularization=1.0,
        entropy_regularization=0.0,
        # Preprocessing
        state_preprocessing='linear_normalization',
        reward_preprocessing=None,
        # Exploration
        exploration=0.3,
        variable_noise=0.2,
        # Default additional config values
        config=None,
        # Save agent every 10 updates and keep the 5 most recent checkpoints
        saver=dict(directory=MODEL_DICT, frequency=10, max_checkpoints=5),
        # Log all available Tensorboard summaries
        summarizer=dict(directory=SUMMARY_DICT, summaries='all'),
        # Do not record agent-environment interaction trace
        recorder=None  # RECORD_DICT
    )

    # Initialize the runner
    runner = Runner(
        agent=agent,
        environment=environment,
        max_episode_timesteps=gym_environment.spec.max_episode_steps,
        # num_parallel=PARALLEL,
        # remote="multiprocessing"
    )

    return agent, runner
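The agent/runner pair returned above is normally driven with Tensorforce's Runner.run; a minimal usage sketch, where the episode count is an arbitrary assumption:

# Sketch: train with the Tensorforce runner created above, then clean up.
agent, runner = get_agent_and_runner()
runner.run(num_episodes=500)
runner.close()
agent.close()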
Example No. 7
def make_env(env_name, max_episode_steps, episode_life=True):
    env = gym.make(env_name)
    env_group_title = get_env_group_title(env)
    # print(env_group_title, env_name)
    if env_group_title == 'gym_minatar':
        env = make_minatar(env, max_episode_steps, scale=False)
        if len(env.observation_space.shape) == 3:
            env = TransposeImage(env)
    elif env_group_title == 'atari' and '-ram' in env_name:
        env = make_atari_ram(env, max_episode_steps, scale=True)
    elif env_group_title == 'atari':
        env = make_atari(env, max_episode_steps)
        env = ReturnWrapper(env)
        env = wrap_deepmind(env,
                            episode_life=episode_life,
                            clip_rewards=False,
                            frame_stack=False,
                            scale=False)
        if len(env.observation_space.shape) == 3:
            env = TransposeImage(env)
        env = FrameStack(env, 4)
    elif env_group_title in [
            'classic_control', 'box2d', 'gym_pygame', 'gym_exploration',
            'pybullet', 'mujoco', 'robotics'
    ]:
        if max_episode_steps > 0:  # Set max episode steps
            env = TimeLimit(env.unwrapped, max_episode_steps)
    return env
Example No. 8
    def make(self, **kwargs) -> Env:
        """Instantiates an instance of the environment with appropriate kwargs"""
        if self.entry_point is None:
            raise error.Error(
                f"Attempting to make deprecated env {self.id}. "
                "(HINT: is there a newer registered version of this env?)")
        _kwargs = self.kwargs.copy()
        _kwargs.update(kwargs)

        if callable(self.entry_point):
            env = self.entry_point(**_kwargs)
        else:
            cls = load(self.entry_point)
            env = cls(**_kwargs)

        # Make the environment aware of which spec it came from.
        spec = copy.deepcopy(self)
        spec.kwargs = _kwargs
        env.unwrapped.spec = spec
        if self.order_enforce:
            from gym.wrappers.order_enforcing import OrderEnforcing

            env = OrderEnforcing(env)
        assert env.spec is not None, "expected spec to be set to the unwrapped env."
        if env.spec.max_episode_steps is not None:
            from gym.wrappers.time_limit import TimeLimit

            env = TimeLimit(env, max_episode_steps=env.spec.max_episode_steps)
        return env
Example No. 9
    def meta_reset(self, seed):
        np.random.seed(seed)

        env = RandomWeightHopperEnv(rand_mass=self.rand_mass,
                                    rand_gravity=self.rand_gravity,
                                    rand_friction=self.rand_friction,
                                    rand_thickness=self.rand_thickness)

        # Based on Hopper-v2
        spec = EnvSpec(
            'RandomWeightHopperEnv-v0',
            entry_point='generic_rl.envs.mujoco:RandomWeightHopperEnv',
            max_episode_steps=1000,
            reward_threshold=3800.0
        )

        env._spec = spec
        env.seed(seed)

        # Wrap the env as needed
        env = TimeLimit(
            env,
            max_episode_steps=spec.max_episode_steps,
            max_episode_seconds=spec.max_episode_seconds
        )

        self.env = env
        # Fix for done flags.
        self.env.reset()
        self.step = env.step
        self.render = env.render
        self.reset = env.reset
Example No. 10
    def meta_reset(self, seed):
        np.random.seed(seed)
        env = NormalHopperEnv()

        # Based on Hopper-v2
        spec = EnvSpec(
            'NormalHopperEnv-v0',
            entry_point='generic_rl.envs.mujoco:NormalHopperEnv',
            max_episode_steps=1000,
            reward_threshold=3800.0
        )

        env._spec = spec
        env.seed(seed)

        # Wrap the env as needed
        env = TimeLimit(
            env,
            max_episode_steps=spec.max_episode_steps,
            max_episode_seconds=spec.max_episode_seconds
        )

        self.env = env
        # Fix for done flags.
        self.env.reset()
        self.step = env.step
        self.render = env.render
        self.reset = env.reset
Example No. 11
def test_remove_time_limit():
    env = gym.make("MsPacmanNoFrameskip-v4")
    env = TransformReward(TimeLimit(AtariPreprocessing(env)), lambda x: x)
    rem_env = remove_time_limit(env)
    assert rem_env.spec.max_episode_steps == int(1e100)
    assert not isinstance(rem_env.env, TimeLimit)
    assert "TimeLimit" not in str(rem_env)
Example No. 12
    def __init__(self, paramters={}):
        self.paramters = paramters
        self.env = TimeLimit(gym.make('PepperPush-v0'), max_episode_steps=100)

        policy_kwargs = dict(activation_fn=th.nn.ReLU,
                             net_arch=[256, 256, 256])
        if "net_arch" in paramters:
            policy_kwargs["net_arch"] = paramters["net_arch"]

        self.model = HER(
            paramters.get("policy", 'MlpPolicy'),
            self.env,
            SAC,
            online_sampling=paramters.get("online_sampling", False),
            verbose=paramters.get("verbose", 0),
            max_episode_length=paramters.get("max_episode_length", 100),
            buffer_size=paramters.get("buffer_size", 1000000),
            batch_size=paramters.get("batch_size", 256),
            learning_rate=paramters.get("learning_rate", 0.001),
            learning_starts=paramters.get("learning_starts", 500),
            n_sampled_goal=paramters.get("n_sampled_goal", 4),
            gamma=paramters.get("gamma", 0.95),
            goal_selection_strategy=paramters.get("goal_selection_strategy",
                                                  'future'),
            ent_coef=paramters.get("ent_coef", 'auto'),
            policy_kwargs=policy_kwargs,
            train_freq=paramters.get("train_freq", 1),
            tensorboard_log=paramters.get("tensorboard_log",
                                          "./data/0_tensorboard/"))
Example No. 13
def load_environment(env_name):
  """Outputs a wrapped gym environment."""
  environment = gym.make(env_name)
  environment = TimeLimit(environment, max_episode_steps=1000)
  environment = wrappers.gym_wrapper.GymWrapper(environment)
  environment = wrappers.SinglePrecisionWrapper(environment)
  return environment
Example No. 14
    def __init__(self, actor_id, game, seed, env_class=None, visualize=False, agent_history_length=1, random_start=False,
                 partially_observed=False):
        try:
            self.env = gym.make(game)
            try:
                self.desc = self.env.unwrapped.desc
            except AttributeError:
                self.desc = None
        except (NameError, ImportError):
            assert env_class is not None, "The specified environment does not seem to be a registered Gym environment: env_class cannot be None."
            spec = registry.spec(game)
            self.env = env_class(**spec._kwargs)
            self.env.unwrapped._spec = spec
            self.desc = self.env.desc
            self.env = TimeLimit(self.env,
                                 max_episode_steps=self.env.spec.max_episode_steps,
                                 max_episode_seconds=self.env.spec.max_episode_seconds)
        self.env.seed(seed * (actor_id + 1))
        if partially_observed:
            self.env = PartiallyObservedCorridor(self.env)
        else:
            self.env = OneHotObservation(self.env)
        if agent_history_length > 1:
            self.env = ObsStack(self.env, agent_history_length)

        self.agent_history_length = agent_history_length

        self.num_actions = self.env.action_space.n
        self.gym_actions = list(range(self.env.action_space.n))
        self.visualize = visualize

        self.grid_shape = self.desc.shape

        self.game = game
        self.np_random, seed = seeding.np_random(seed)
Example No. 15
def make_env(envName):
    env = {
        'KukaReach-v1': ReachEnv,
        'KukaPickAndPlaceObstacle-v1': PickObstacleEnv,
        'KukaPickAndPlaceObstacle-v2': PickObstacleEnvV2,
        'KukaPickNoObstacle-v1': PickNoObstacleEnv,
        'KukaPickNoObstacle-v2': PickNoObstacleEnvV2,
        'KukaPickThrow-v1': PickThrowEnv,
        'KukaPushLabyrinth-v1': PushLabyrinthEnv,
        'KukaPushLabyrinth-v2': PushLabyrinthEnvV2,
        'KukaPushSlide-v1': PushSlide,
        'KukaPushNew-v1': PushNewEnv
    }[envName]()
    MAXEPISODESTEPS = {
        'KukaReach-v1': 50,
        'KukaPickAndPlaceObstacle-v1': 100,
        'KukaPickAndPlaceObstacle-v2': 100,
        'KukaPickNoObstacle-v1': 100,
        'KukaPickNoObstacle-v2': 100,
        'KukaPickThrow-v1': 100,
        'KukaPushLabyrinth-v1': 100,
        'KukaPushSlide-v1': 100,
        'KukaPushNew-v1': 200,
        'KukaPushLabyrinth-v2': 100
    }[envName]
    env = TimeLimit(env, max_episode_steps=MAXEPISODESTEPS)
    return env
Example No. 16
    def create():
        for import_name in imports:
            importlib.import_module(import_name)
        if isinstance(env_type, str):
            assert (not kwargs), "ENV kwargs not supported for gym envs"
            env = gym.make(env_type)
        elif callable(env_type):
            env = env_type(**kwargs)
        else:
            raise ValueError("make_env_creator() expected either a string or "
                             f"callable for 'env_type', got {type(env_type)}")

        # Limit the max steps per episode if requested
        if max_episode_steps is not None:
            if hasattr(env, "_max_episode_steps"):
                # Use the '_max_episode_steps' if available from gym. This is
                # to allow increasing the limit for example in cartpole.
                # (The TimeLimit option can only decrease the limit)
                env._max_episode_steps = max_episode_steps
            else:
                env = TimeLimit(env, max_episode_steps)

        # Always begin with EpisodeTracker so that the training gets the real
        # rewards/dones before any additional wrappers process them
        env = EpisodeTracker(env)

        # Apply all requested wrappers
        for wrapper in wrappers:
            wrapper_type = wrapper.get("type")
            wrapper_args = wrapper.get("args", {})
            env = wrapper_type(env, **wrapper_args)
        return env
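The comment in the middle of create() points at an asymmetry worth spelling out: an outer TimeLimit can only shorten episodes relative to an inner limit, while overwriting _max_episode_steps on an env that already has one can lengthen them. A minimal sketch of the difference, assuming the classic gym API where gym.make already returns a TimeLimit-wrapped CartPole with a 500-step cap:

import gym
from gym.wrappers import TimeLimit

# Raising the limit: mutate the existing TimeLimit wrapper in place.
longer = gym.make("CartPole-v1")
longer._max_episode_steps = 2000          # episodes may now run up to 2000 steps

# Lowering the limit: an extra outer TimeLimit truncates earlier, but the inner
# 500-step wrapper would still fire first if the outer limit were set above 500.
shorter = TimeLimit(gym.make("CartPole-v1"), max_episode_steps=100)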
Example No. 17
 def __init__(self, env, **kwargs):
     if (env.spec.timestep_limit
             is not None) and not env.spec.tags.get('vnc'):
         from gym.wrappers.time_limit import TimeLimit
         env = TimeLimit(env,
                         max_episode_steps=env.spec.max_episode_steps,
                         max_episode_seconds=env.spec.max_episode_seconds)
     super(Environment, self).__init__(env)
Example No. 18
def make_atari(env, max_episode_steps):
    # assert 'NoFrameskip' in env.spec.id
    print('set time limit:', max_episode_steps)
    env = NoopResetEnv(env, noop_max=30)
    env = MaxAndSkipEnv(env, skip=4)
    if max_episode_steps > 0:
        env = TimeLimit(env, max_episode_steps=max_episode_steps)
    return env
Example No. 19
def Monitor(env=None, directory=None, video_callable=None, force=False, resume=False,
            write_upon_reset=False, uid=None, mode=None):
    if not isinstance(env, gym.Env):
        raise error.Error("Monitor decorator syntax is deprecated as of 12/28/2016. Replace your call to `env = gym.wrappers.Monitor(directory)(env)` with `env = gym.wrappers.Monitor(env, directory)`")

    # TODO: add duration in seconds also
    return _Monitor(TimeLimit(env, max_episode_steps=env.spec.timestep_limit), directory, video_callable, force, resume,
                    write_upon_reset, uid, mode)
Example No. 20
    def __init__(self, env):
        TimeLimit.__init__(self, env)

        self.observation_scale = np.ones_like(self.observation_space.high)
        self.observation_shift = np.zeros_like(self.observation_space.high)

        self.action_scale = (self.action_space.high - self.action_space.low) / 2
        self.action_shift = (self.action_space.high + self.action_space.low) / 2

        self.rewards_scale = 1e-1
        self.rewards_shift = 0.

        # update observation and action space
        self.observation_space = gym.spaces.Box(self.normalize_observation(self.observation_space.low),
                                                self.normalize_observation(self.observation_space.high))
        self.action_space = gym.spaces.Box(-np.ones_like(self.action_space.high),
                                            np.ones_like(self.action_space.high))
Example No. 21
    def step(self, action):
        normalized_action = np.clip(self.normalized_action(action), self.action_space.low, self.action_space.high)

        obs, reward, term, info = TimeLimit.step(self, normalized_action)

        normalized_observation = self.normalized_observation(obs)

        return normalized_observation, reward, term, info
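The normalized_action and normalized_observation helpers called in step() (and normalize_observation used in __init__) are not shown. A minimal sketch of one plausible reading, using the scale/shift attributes defined in the previous example: agent actions in [-1, 1] are mapped back to the native range, and observations are shifted and scaled (an identity transform unless the defaults are changed):

    def normalized_action(self, action):
        # Map an action from the advertised [-1, 1] box back to the native range.
        return action * self.action_scale + self.action_shift

    def normalized_observation(self, obs):
        # Shift and scale the raw observation (identity with the default values).
        return (obs - self.observation_shift) / self.observation_scale

    def normalize_observation(self, obs):
        # Same transform, used in __init__ to rescale the observation-space bounds.
        return (obs - self.observation_shift) / self.observation_scale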
Example No. 22
 def make_policy_env():
     repeats = 3
     pol_env = RestartablePendulumEnv(
         repeats=repeats, pixels=True)  # can specify cost="dm_control"
     pol_env = TimeLimit(pol_env, max_episode_steps=int(
         200 / repeats))  # only run the environment for 200 true steps
     proj = np.eye(rep_model.enc_dim)
     return ew.TorchEncoderWrapper(pol_env, encnet, proj)
Example No. 23
def make_atari(env_id, max_episode_steps=None):
    env = gym.make(env_id)
    # Overriding the spec id here makes the NoFrameskip assertion below always pass.
    env.spec.id = 'NoFrameskip'
    assert 'NoFrameskip' in env.spec.id
    env = NoopResetEnv(env, noop_max=30)
    env = MaxAndSkipEnv(env, skip=4)
    if max_episode_steps is not None:
        env = TimeLimit(env, max_episode_steps=max_episode_steps)
    return env
Example No. 24
def render_episode(env: TimeLimit, estimator: CNN_DQN):
    obs = env.reset()
    state = get_state(obs)
    is_done = False
    while not is_done:
        sleep(0.0415)  # (~24 fps)

        rgb = env.render('rgb_array')
        upscaled = repeat_upsample(rgb, 3, 4)
        viewer.imshow(upscaled)

        actionIndex = torch.argmax(estimator.predict(state)).item()
        action = ACTIONS[actionIndex]
        obs, reward, is_done, _ = env.step(action)
        if reward != 0:
            print(reward)
        state = get_state(obs)
    env.close()
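viewer and repeat_upsample are globals that are not shown; a minimal sketch of typical definitions, where the SimpleImageViewer class from gym's classic-control rendering utilities and the nearest-neighbour upscaling are assumptions consistent with the calls above:

import numpy as np
from gym.envs.classic_control.rendering import SimpleImageViewer

viewer = SimpleImageViewer()

def repeat_upsample(rgb_array, k=1, l=1):
    # Nearest-neighbour upscaling: repeat each row k times and each column l times.
    return np.repeat(np.repeat(rgb_array, k, axis=0), l, axis=1)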
Example No. 25
 def make(self, id):
     logger.info('Making new env: %s', id)
     spec = self.spec(id)
     env = spec.make()
     if (env.spec.timestep_limit is not None) and not spec.tags.get('vnc'):
         from gym.wrappers.time_limit import TimeLimit
         env = TimeLimit(env,
                         max_episode_steps=env.spec.max_episode_steps,
                         max_episode_seconds=env.spec.max_episode_seconds)
     return env
Example No. 26
def make_atari_ram(env, max_episode_steps, scale=True, episode_life=True):
    env = NoopResetEnv(env, noop_max=30)
    env = MaxAndSkipEnv(env, skip=4)
    if max_episode_steps > 0:
        env = TimeLimit(env, max_episode_steps=max_episode_steps)
    if scale:
        env = ScaledFloatFrame(env)
    if episode_life:
        env = EpisodicLifeEnv(env)
    return env
Example No. 27
 def make(self):
     env = ValueWrapper(
         TimeLimit(
             ToyVMEnv(self.slate_size),
             max_episode_steps=self.max_episode_steps,
         ),
         zero_augment,
     )
     if self.initial_seed:
         env.seed(self.initial_seed)
     return env
Example No. 28
 def make(self, id):
     logger.info('Making new env: %s', id)
     spec = self.spec(id)
     env = spec.make()
     if hasattr(env, "_reset") and hasattr(env, "_step"):
         patch_deprecated_methods(env)
     if (env.spec.timestep_limit is not None) and not spec.tags.get('vnc'):
         from gym.wrappers.time_limit import TimeLimit
         env = TimeLimit(env,
                         max_episode_steps=env.spec.max_episode_steps,
                         max_episode_seconds=env.spec.max_episode_seconds)
     return env
Example No. 29
def collect_fixed_set_of_states(conf: dict, env: TimeLimit) -> list:
    # Collect samples to evaluate the agent on a fixed set of samples
    # (DQN paper). Collect a fixed set of states by running a random policy
    # before training starts and track the average of the maximum predicted
    # Q for these states.
    env.reset()
    exclude = conf['preprocess']['exclude']
    fixed_states = []

    while True:
        action = env.action_space.sample()
        next_state, reward, done, _ = env.step(action)
        state = next_state
        preprocessed_state = preprocess_frame(state, exclude)
        fixed_states.append(preprocessed_state)
        if done:
            break
    env.close()
    print(f'Collected a fixed set of {len(fixed_states)} states!')

    return fixed_states
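The comment above describes tracking the average of the maximum predicted Q-value over this fixed state set. A minimal sketch of that metric, assuming the same estimator.predict interface used in the other examples here:

import torch

def average_max_q(estimator, fixed_states):
    # Average of max_a Q(s, a) over the fixed evaluation states (DQN-paper style metric).
    with torch.no_grad():
        max_qs = [torch.max(estimator.predict(s)).item() for s in fixed_states]
    return sum(max_qs) / len(max_qs)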
Example No. 30
def load_d4rl(env_name, default_time_limit=1000):
    """Loads the python environment from D4RL."""
    gym_env = gym.make(env_name)
    gym_spec = gym.spec(env_name)

    # Use the env's own time limit; fall back to default_time_limit only if the spec does not specify one.
    if gym_spec.max_episode_steps in [0, None]:
        gym_env = TimeLimit(gym_env, max_episode_steps=default_time_limit)

    # Wrap TF-Agents environment.
    env = gym_wrapper.GymWrapper(gym_env)
    return env
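The returned environment is a TF-Agents PyEnvironment; converting it for TensorFlow-based agents usually goes through TFPyEnvironment. A minimal usage sketch, where the D4RL task name is an assumption:

from tf_agents.environments import tf_py_environment

py_env = load_d4rl('halfcheetah-medium-v0')   # assumed D4RL task name
tf_env = tf_py_environment.TFPyEnvironment(py_env)
time_step = tf_env.reset()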