def __init__(self, name, globalAC, hard_share=None, soft_sharing_coeff_actor=0.0,
             soft_sharing_coeff_critic=0.0, gradient_clip_actor=0.0,
             gradient_clip_critic=0.0, debug=False, max_ep_steps=200,
             image_shape=None, stack=1):
    self.env = gym.make(GAME).unwrapped
    self.env = TimeLimit(self.env, max_episode_steps=max_ep_steps)
    self.name = name
    self.AC = ACNet(name, globalAC, hard_share=hard_share,
                    soft_sharing_coeff_actor=soft_sharing_coeff_actor,
                    soft_sharing_coeff_critic=soft_sharing_coeff_critic,
                    gradient_clip_actor=gradient_clip_actor,
                    gradient_clip_critic=gradient_clip_critic,
                    image_shape=image_shape, stack=stack)
    self.debug = debug
    self.image_shape = image_shape
    self.stack = stack
def reacher(n_links=2):
    if n_links == 2:
        return TimeLimit(Reacher2Link(), max_episode_steps=50,
                         max_episode_seconds=None)
    elif n_links == 3:
        return TimeLimit(Reacher3Link(), max_episode_steps=50,
                         max_episode_seconds=None)
    raise ValueError('Unsupported number of links: {}'.format(n_links))
def evaluate(self):
    test_env = TimeLimit(gym.make('PepperPush-v0'), max_episode_steps=100)
    # With return_episode_rewards=False, evaluate_policy returns
    # (mean_reward, std_reward); report the mean. evaluate_policy resets the
    # env itself, so no explicit reset is needed here.
    results = evaluate_policy(self.model, test_env, n_eval_episodes=75,
                              return_episode_rewards=False)
    return results[0]
def run_episode(env: TimeLimit, estimator: CNN_DQN):
    obs = env.reset()
    state = get_state(obs)
    is_done = False
    total_reward = 0
    while not is_done:
        action_index = torch.argmax(estimator.predict(state)).item()
        action = ACTIONS[action_index]
        obs, reward, is_done, info = env.step(action)
        state = get_state(obs)
        total_reward += reward
    return total_reward
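# Hedged usage sketch for run_episode: the env id, the time limit, and the
# trained `estimator` are illustrative assumptions, not from the source.
env = TimeLimit(gym.make('PongNoFrameskip-v4'), max_episode_steps=10000)
estimator = CNN_DQN()  # assumed: a trained model exposing .predict(state)
rewards = [run_episode(env, estimator) for _ in range(10)]
print('mean reward over 10 episodes:', sum(rewards) / len(rewards))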
def run_discrete(environment_name, mapping, shape=None, problem=None):
    if problem is None:
        problem = gym.make(environment_name)
    else:
        from gym.wrappers.time_limit import TimeLimit
        problem = TimeLimit(problem)
    print('== {} =='.format(environment_name))
    print('Actions:', problem.env.action_space.n)
    print('States:', problem.env.observation_space.n)
    print(problem.env.desc)
    print()

    print('== Value Iteration ==')
    value_policy, iters = value_iteration(problem)
    print('Iterations:', iters)
    print()

    print('== Policy Iteration ==')
    policy, iters = policy_iteration(problem)
    print('Iterations:', iters)
    print()

    diff = sum(abs(x - y)
               for x, y in zip(policy.flatten(), value_policy.flatten()))
    if diff > 0:
        print('Discrepancy:', diff)
        print()

    if shape is not None:
        print('== Policy ==')
        print_policy(policy, mapping, shape)
        print()

    return policy
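# Hedged usage sketch: FrozenLake-v0 is a discrete env exposing the
# action_space.n, observation_space.n, and `desc` grid that run_discrete
# prints; the action-letter mapping and the 4x4 shape are illustrative.
mapping = {0: 'L', 1: 'D', 2: 'R', 3: 'U'}
policy = run_discrete('FrozenLake-v0', mapping, shape=(4, 4))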
def get_agent_and_runner(max_timesteps=EPISODE_MAX_LENGTH):
    max_timesteps = EPISODE_MAX_LENGTH if max_timesteps is None else max_timesteps

    # OpenAI-Gym environment specification
    gym_environment = gym.make(LEVEL, render=True)
    gym_environment = TimeLimit(gym_environment.unwrapped,
                                max_episode_steps=max_timesteps)
    # gym_environment = Monitor(gym_environment, RECORD_DICT, force=True)

    environment = Environment.create(
        environment=gym_environment,
        max_episode_timesteps=gym_environment.spec.max_episode_steps,
    )

    agent = Agent.create(
        agent='a2c',
        environment=environment,
        # parallel_interactions=PARALLEL,
        # Automatically configured network
        # network='auto',
        network=[
            dict(type='dense', size=256, activation='tanh'),
            dict(type='dense', size=256, activation='tanh'),
            dict(type='dense', size=256, activation='tanh'),
        ],
        # AC optimization parameters
        batch_size=256,
        update_frequency=2,
        learning_rate=0.001,
        # Reward estimation
        discount=0.99,
        predict_terminal_values=False,
        # Regularization
        l2_regularization=1.0,
        entropy_regularization=0.0,
        # Preprocessing
        state_preprocessing='linear_normalization',
        reward_preprocessing=None,
        # Exploration
        exploration=0.3,
        variable_noise=0.2,
        # Default additional config values
        config=None,
        # Save agent every 10 updates and keep the 5 most recent checkpoints
        saver=dict(directory=MODEL_DICT, frequency=10, max_checkpoints=5),
        # Log all available Tensorboard summaries
        summarizer=dict(directory=SUMMARY_DICT, summaries='all'),
        # Do not record agent-environment interaction trace
        recorder=None,  # RECORD_DICT
    )

    # Initialize the runner
    runner = Runner(
        agent=agent,
        environment=environment,
        max_episode_timesteps=gym_environment.spec.max_episode_steps,
        # num_parallel=PARALLEL,
        # remote="multiprocessing"
    )

    return agent, runner
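# Hedged usage sketch of the Tensorforce runner returned above; the episode
# count is arbitrary.
agent, runner = get_agent_and_runner()
runner.run(num_episodes=100)
runner.close()
agent.close()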
def make_env(env_name, max_episode_steps, episode_life=True):
    env = gym.make(env_name)
    env_group_title = get_env_group_title(env)
    # print(env_group_title, env_name)
    if env_group_title == 'gym_minatar':
        env = make_minatar(env, max_episode_steps, scale=False)
        if len(env.observation_space.shape) == 3:
            env = TransposeImage(env)
    elif env_group_title == 'atari' and '-ram' in env_name:
        env = make_atari_ram(env, max_episode_steps, scale=True)
    elif env_group_title == 'atari':
        env = make_atari(env, max_episode_steps)
        env = ReturnWrapper(env)
        env = wrap_deepmind(env, episode_life=episode_life, clip_rewards=False,
                            frame_stack=False, scale=False)
        if len(env.observation_space.shape) == 3:
            env = TransposeImage(env)
        env = FrameStack(env, 4)
    elif env_group_title in [
            'classic_control', 'box2d', 'gym_pygame', 'gym_exploration',
            'pybullet', 'mujoco', 'robotics'
    ]:
        if max_episode_steps > 0:  # Set max episode steps
            env = TimeLimit(env.unwrapped, max_episode_steps)
    return env
def make(self, **kwargs) -> Env:
    """Instantiates an instance of the environment with appropriate kwargs"""
    if self.entry_point is None:
        raise error.Error(
            f"Attempting to make deprecated env {self.id}. "
            "(HINT: is there a newer registered version of this env?)")

    _kwargs = self.kwargs.copy()
    _kwargs.update(kwargs)

    if callable(self.entry_point):
        env = self.entry_point(**_kwargs)
    else:
        cls = load(self.entry_point)
        env = cls(**_kwargs)

    # Make the environment aware of which spec it came from.
    spec = copy.deepcopy(self)
    spec.kwargs = _kwargs
    env.unwrapped.spec = spec

    if self.order_enforce:
        from gym.wrappers.order_enforcing import OrderEnforcing
        env = OrderEnforcing(env)

    assert env.spec is not None, "expected spec to be set to the unwrapped env."
    if env.spec.max_episode_steps is not None:
        from gym.wrappers.time_limit import TimeLimit
        env = TimeLimit(env, max_episode_steps=env.spec.max_episode_steps)

    return env
def meta_reset(self, seed):
    np.random.seed(seed)

    env = RandomWeightHopperEnv(rand_mass=self.rand_mass,
                                rand_gravity=self.rand_gravity,
                                rand_friction=self.rand_friction,
                                rand_thickness=self.rand_thickness)
    # Based on Hopper-v2
    spec = EnvSpec(
        'RandomWeightHopperEnv-v0',
        entry_point='generic_rl.envs.mujoco:RandomWeightHopperEnv',
        max_episode_steps=1000,
        reward_threshold=3800.0,
    )
    env._spec = spec
    env.seed(seed)

    # Wrap the env as needed
    env = TimeLimit(
        env,
        max_episode_steps=spec.max_episode_steps,
        max_episode_seconds=spec.max_episode_seconds,
    )
    self.env = env

    # Fix for done flags.
    self.env.reset()
    self.step = env.step
    self.render = env.render
    self.reset = env.reset
def meta_reset(self, seed):
    np.random.seed(seed)

    env = NormalHopperEnv()
    # Based on Hopper-v2
    spec = EnvSpec(
        'NormalHopperEnv-v0',
        entry_point='generic_rl.envs.mujoco:NormalHopperEnv',
        max_episode_steps=1000,
        reward_threshold=3800.0,
    )
    env._spec = spec
    env.seed(seed)

    # Wrap the env as needed
    env = TimeLimit(
        env,
        max_episode_steps=spec.max_episode_steps,
        max_episode_seconds=spec.max_episode_seconds,
    )
    self.env = env

    # Fix for done flags.
    self.env.reset()
    self.step = env.step
    self.render = env.render
    self.reset = env.reset
def test_remove_time_limit():
    env = gym.make("MsPacmanNoFrameskip-v4")
    env = TransformReward(TimeLimit(AtariPreprocessing(env)), lambda x: x)
    rem_env = remove_time_limit(env)
    assert rem_env.spec.max_episode_steps == int(1e100)
    assert not isinstance(rem_env.env, TimeLimit)
    assert "TimeLimit" not in str(rem_env)
def __init__(self, parameters=None):
    # Avoid a mutable default argument; fall back to an empty dict.
    parameters = parameters if parameters is not None else {}
    self.parameters = parameters
    self.env = TimeLimit(gym.make('PepperPush-v0'), max_episode_steps=100)

    policy_kwargs = dict(activation_fn=th.nn.ReLU, net_arch=[256, 256, 256])
    if "net_arch" in parameters:
        policy_kwargs["net_arch"] = parameters["net_arch"]

    self.model = HER(
        parameters.get("policy", 'MlpPolicy'),
        self.env,
        SAC,
        online_sampling=parameters.get("online_sampling", False),
        verbose=parameters.get("verbose", 0),
        max_episode_length=parameters.get("max_episode_length", 100),
        buffer_size=parameters.get("buffer_size", 1000000),
        batch_size=parameters.get("batch_size", 256),
        learning_rate=parameters.get("learning_rate", 0.001),
        learning_starts=parameters.get("learning_starts", 500),
        n_sampled_goal=parameters.get("n_sampled_goal", 4),
        gamma=parameters.get("gamma", 0.95),
        goal_selection_strategy=parameters.get("goal_selection_strategy", 'future'),
        ent_coef=parameters.get("ent_coef", 'auto'),
        policy_kwargs=policy_kwargs,
        train_freq=parameters.get("train_freq", 1),
        tensorboard_log=parameters.get("tensorboard_log", "./data/0_tensorboard/"),
    )
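# Hedged usage sketch tying __init__ and evaluate (above) together; the class
# name `PepperPushAgent` and the hyperparameters are illustrative assumptions.
agent = PepperPushAgent({"batch_size": 128, "verbose": 1})
agent.model.learn(total_timesteps=50000)
print('mean evaluation reward:', agent.evaluate())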
def load_environment(env_name):
    """Outputs a wrapped gym environment."""
    environment = gym.make(env_name)
    environment = TimeLimit(environment, max_episode_steps=1000)
    environment = wrappers.gym_wrapper.GymWrapper(environment)
    environment = wrappers.SinglePrecisionWrapper(environment)
    return environment
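# Hedged usage sketch; the env id is illustrative. The result is a
# single-precision dm_env-style environment per the wrappers above.
environment = load_environment('MountainCarContinuous-v0')
timestep = environment.reset()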
def __init__(self, actor_id, game, seed, env_class=None, visualize=False,
             agent_history_length=1, random_start=False, partially_observed=False):
    try:
        self.env = gym.make(game)
        try:
            self.desc = self.env.unwrapped.desc
        except AttributeError:
            self.desc = None
    except (NameError, ImportError):
        assert env_class is not None, (
            "The specified environment does not seem to be a registered Gym "
            "environment: env_class cannot be None.")
        spec = registry.spec(game)
        self.env = env_class(**spec._kwargs)
        self.env.unwrapped._spec = spec
        self.desc = self.env.desc
        # gym.make applies TimeLimit automatically; the env_class path must
        # wrap the env itself.
        self.env = TimeLimit(self.env,
                             max_episode_steps=self.env.spec.max_episode_steps,
                             max_episode_seconds=self.env.spec.max_episode_seconds)
    self.env.seed(seed * (actor_id + 1))

    if partially_observed:
        self.env = PartiallyObservedCorridor(self.env)
    else:
        self.env = OneHotObservation(self.env)
    if agent_history_length > 1:
        self.env = ObsStack(self.env, agent_history_length)

    self.agent_history_length = agent_history_length
    self.num_actions = self.env.action_space.n
    self.gym_actions = list(range(self.env.action_space.n))
    self.visualize = visualize
    self.grid_shape = self.desc.shape
    self.game = game
    self.np_random, seed = seeding.np_random(seed)
def make_env(envName):
    env = {
        'KukaReach-v1': ReachEnv,
        'KukaPickAndPlaceObstacle-v1': PickObstacleEnv,
        'KukaPickAndPlaceObstacle-v2': PickObstacleEnvV2,
        'KukaPickNoObstacle-v1': PickNoObstacleEnv,
        'KukaPickNoObstacle-v2': PickNoObstacleEnvV2,
        'KukaPickThrow-v1': PickThrowEnv,
        'KukaPushLabyrinth-v1': PushLabyrinthEnv,
        'KukaPushLabyrinth-v2': PushLabyrinthEnvV2,
        'KukaPushSlide-v1': PushSlide,
        'KukaPushNew-v1': PushNewEnv,
    }[envName]()
    MAXEPISODESTEPS = {
        'KukaReach-v1': 50,
        'KukaPickAndPlaceObstacle-v1': 100,
        'KukaPickAndPlaceObstacle-v2': 100,
        'KukaPickNoObstacle-v1': 100,
        'KukaPickNoObstacle-v2': 100,
        'KukaPickThrow-v1': 100,
        'KukaPushLabyrinth-v1': 100,
        'KukaPushLabyrinth-v2': 100,
        'KukaPushSlide-v1': 100,
        'KukaPushNew-v1': 200,
    }[envName]
    env = TimeLimit(env, max_episode_steps=MAXEPISODESTEPS)
    return env
def create():
    for import_name in imports:
        importlib.import_module(import_name)
    if isinstance(env_type, str):
        assert (not kwargs), "ENV kwargs not supported for gym envs"
        env = gym.make(env_type)
    elif callable(env_type):
        env = env_type(**kwargs)
    else:
        raise ValueError("make_env_creator() expected either a string or "
                         f"callable for 'env_type', got {type(env_type)}")

    # Limit the max steps per episode if requested
    if max_episode_steps is not None:
        if hasattr(env, "_max_episode_steps"):
            # Use the '_max_episode_steps' if available from gym. This is
            # to allow increasing the limit for example in cartpole.
            # (The TimeLimit option can only decrease the limit)
            env._max_episode_steps = max_episode_steps
        else:
            env = TimeLimit(env, max_episode_steps)

    # Always begin with EpisodeTracker so that the training gets the real
    # rewards/dones before any additional wrappers process them
    env = EpisodeTracker(env)

    # Apply all requested wrappers
    for wrapper in wrappers:
        wrapper_type = wrapper.get("type")
        wrapper_args = wrapper.get("args", {})
        env = wrapper_type(env, **wrapper_args)
    return env
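# `create` reads `imports`, `env_type`, `kwargs`, `max_episode_steps`, and
# `wrappers` from an enclosing make_env_creator(...) scope (named in its error
# message). A hedged usage sketch, with an assumed keyword signature:
creator = make_env_creator(env_type='CartPole-v1', max_episode_steps=1000,
                           wrappers=[])
env = creator()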
def __init__(self, env, **kwargs):
    if (env.spec.timestep_limit is not None) and not env.spec.tags.get('vnc'):
        from gym.wrappers.time_limit import TimeLimit
        env = TimeLimit(env,
                        max_episode_steps=env.spec.max_episode_steps,
                        max_episode_seconds=env.spec.max_episode_seconds)
    super(Environment, self).__init__(env)
def make_atari(env, max_episode_steps):
    # assert 'NoFrameskip' in env.spec.id
    print('set time limit:', max_episode_steps)
    env = NoopResetEnv(env, noop_max=30)
    env = MaxAndSkipEnv(env, skip=4)
    if max_episode_steps > 0:
        env = TimeLimit(env, max_episode_steps=max_episode_steps)
    return env
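# Hedged usage sketch: this variant of make_atari expects an already
# constructed env (unlike the id-taking variant further down); the env id and
# step limit are illustrative.
env = make_atari(gym.make('BreakoutNoFrameskip-v4'), max_episode_steps=10000)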
def Monitor(env=None, directory=None, video_callable=None, force=False,
            resume=False, write_upon_reset=False, uid=None, mode=None):
    if not isinstance(env, gym.Env):
        raise error.Error(
            "Monitor decorator syntax is deprecated as of 12/28/2016. Replace "
            "your call to `env = gym.wrappers.Monitor(directory)(env)` with "
            "`env = gym.wrappers.Monitor(env, directory)`")
    # TODO: add duration in seconds also
    return _Monitor(TimeLimit(env, max_episode_steps=env.spec.timestep_limit),
                    directory, video_callable, force, resume,
                    write_upon_reset, uid, mode)
def __init__(self, env):
    TimeLimit.__init__(self, env)
    self.observation_scale = np.ones_like(self.observation_space.high)
    self.observation_shift = np.zeros_like(self.observation_space.high)
    self.action_scale = (self.action_space.high - self.action_space.low) / 2
    self.action_shift = (self.action_space.high + self.action_space.low) / 2
    self.rewards_scale = 1e-1
    self.rewards_shift = 0.

    # update observation and action space
    self.observation_space = gym.spaces.Box(
        self.normalize_observation(self.observation_space.low),
        self.normalize_observation(self.observation_space.high))
    self.action_space = gym.spaces.Box(-np.ones_like(self.action_space.high),
                                       np.ones_like(self.action_space.high))
def step(self, action):
    # Clip the normalized action to the valid range; np.clip takes the lower
    # bound before the upper bound.
    normalized_action = np.clip(self.normalized_action(action),
                                self.action_space.low,
                                self.action_space.high)
    obs, reward, term, info = TimeLimit.step(self, normalized_action)
    normalized_observation = self.normalized_observation(obs)
    return normalized_observation, reward, term, info
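# Minimal sketch of the normalization helpers the wrapper above calls; the
# method names match the calls in __init__ and step, but these bodies are
# assumptions inferred from the scale/shift attributes, not confirmed source.
def normalized_observation(self, observation):
    return (observation + self.observation_shift) * self.observation_scale

def normalize_observation(self, observation):
    return self.normalized_observation(observation)

def normalized_action(self, action):
    # map an agent action in [-1, 1] back to the wrapped env's native range
    return action * self.action_scale + self.action_shift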
def make_policy_env():
    repeats = 3
    pol_env = RestartablePendulumEnv(repeats=repeats,
                                     pixels=True)  # can specify cost="dm_control"
    # only run the environment for 200 true steps
    pol_env = TimeLimit(pol_env, max_episode_steps=int(200 / repeats))
    proj = np.eye(rep_model.enc_dim)
    return ew.TorchEncoderWrapper(pol_env, encnet, proj)
def make_atari(env_id, max_episode_steps=None):
    env = gym.make(env_id)
    # Overwriting the spec id makes the NoFrameskip assertion below pass
    # regardless of which env_id was requested.
    env.spec.id = 'NoFrameskip'
    assert 'NoFrameskip' in env.spec.id
    env = NoopResetEnv(env, noop_max=30)
    env = MaxAndSkipEnv(env, skip=4)
    if max_episode_steps is not None:
        env = TimeLimit(env, max_episode_steps=max_episode_steps)
    return env
def render_episode(env: TimeLimit, estimator: CNN_DQN):
    obs = env.reset()
    state = get_state(obs)
    is_done = False
    while not is_done:
        sleep(0.0415)  # (~24 fps)
        rgb = env.render('rgb_array')
        upscaled = repeat_upsample(rgb, 3, 4)
        viewer.imshow(upscaled)
        action_index = torch.argmax(estimator.predict(state)).item()
        action = ACTIONS[action_index]
        obs, reward, is_done, _ = env.step(action)
        if reward != 0:
            print(reward)
        state = get_state(obs)
    env.close()
def make(self, id):
    logger.info('Making new env: %s', id)
    spec = self.spec(id)
    env = spec.make()
    if (env.spec.timestep_limit is not None) and not spec.tags.get('vnc'):
        from gym.wrappers.time_limit import TimeLimit
        env = TimeLimit(env,
                        max_episode_steps=env.spec.max_episode_steps,
                        max_episode_seconds=env.spec.max_episode_seconds)
    return env
def make_atari_ram(env, max_episode_steps, scale=True, episode_life=True):
    env = NoopResetEnv(env, noop_max=30)
    env = MaxAndSkipEnv(env, skip=4)
    if max_episode_steps > 0:
        env = TimeLimit(env, max_episode_steps=max_episode_steps)
    if scale:
        env = ScaledFloatFrame(env)
    if episode_life:
        env = EpisodicLifeEnv(env)
    return env
def make(self):
    env = ValueWrapper(
        TimeLimit(
            ToyVMEnv(self.slate_size),
            max_episode_steps=self.max_episode_steps,
        ),
        zero_augment,
    )
    if self.initial_seed:
        env.seed(self.initial_seed)
    return env
def make(self, id):
    logger.info('Making new env: %s', id)
    spec = self.spec(id)
    env = spec.make()
    if hasattr(env, "_reset") and hasattr(env, "_step"):
        patch_deprecated_methods(env)
    if (env.spec.timestep_limit is not None) and not spec.tags.get('vnc'):
        from gym.wrappers.time_limit import TimeLimit
        env = TimeLimit(env,
                        max_episode_steps=env.spec.max_episode_steps,
                        max_episode_seconds=env.spec.max_episode_seconds)
    return env
def collect_fixed_set_of_states(conf: dict, env: TimeLimit) -> list:
    # Collect a fixed set of states on which to evaluate the agent
    # (DQN paper): run a random policy before training starts, then track the
    # average of the maximum predicted Q over these states.
    env.reset()
    exclude = conf['preprocess']['exclude']
    fixed_states = []
    while True:
        action = env.action_space.sample()
        next_state, reward, done, _ = env.step(action)
        state = next_state
        preprocessed_state = preprocess_frame(state, exclude)
        fixed_states.append(preprocessed_state)
        if done:
            break
    env.close()
    print(f'Collected a fixed set of {len(fixed_states)} states!')
    return fixed_states
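# Hedged usage sketch: track the DQN-paper diagnostic over the fixed states,
# i.e. the average maximum predicted Q; `conf`, `env`, and `estimator` are
# assumed caller-side objects.
fixed_states = collect_fixed_set_of_states(conf, env)
max_qs = [torch.max(estimator.predict(s)).item() for s in fixed_states]
print('average max Q over fixed states:', sum(max_qs) / len(max_qs))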
def load_d4rl(env_name, default_time_limit=1000):
    """Loads the python environment from D4RL."""
    gym_env = gym.make(env_name)
    gym_spec = gym.spec(env_name)
    # Use the spec's time limit when set; otherwise fall back to the default.
    if gym_spec.max_episode_steps in [0, None]:
        gym_env = TimeLimit(gym_env, max_episode_steps=default_time_limit)

    # Wrap as a TF-Agents environment.
    env = gym_wrapper.GymWrapper(gym_env)
    return env
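# Hedged usage sketch; 'halfcheetah-medium-v0' is an illustrative D4RL id.
# The returned env follows the TF-Agents PyEnvironment interface.
env = load_d4rl('halfcheetah-medium-v0')
timestep = env.reset()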