def make(env_name, frame_stack, action_repeat, seed):
    domain, task = split_env_name(env_name)
    if domain == 'manip':
        env = manipulation.load(f'{task}_vision', seed=seed)
    else:
        env = suite.load(domain, task, task_kwargs={'random': seed}, visualize_reward=False)
    # apply action repeat and scaling
    env = ActionRepeatWrapper(env, action_repeat)
    env = action_scale.Wrapper(env, minimum=-1.0, maximum=+1.0)
    # flatten features
    env = FlattenObservationWrapper(env)
    if domain != 'manip':
        # per dreamer: https://github.com/danijar/dreamer/blob/02f0210f5991c7710826ca7881f19c64a012290c/wrappers.py#L26
        camera_id = 2 if domain == 'quadruped' else 0
        render_kwargs = {'height': 84, 'width': 84, 'camera_id': camera_id}
        env = pixels.Wrapper(env, pixels_only=False, render_kwargs=render_kwargs)
    env = FrameStackWrapper(env, frame_stack)
    action_spec = env.action_spec()
    assert np.all(action_spec.minimum >= -1.0)
    assert np.all(action_spec.maximum <= +1.0)
    return env
def __init__(self, args: argparse.Namespace):
    assert args.env_batch_size > 0
    assert args.environment_name in CONTROL_SUITE_ENVS
    assert args.max_episode_length > 0
    from dm_control import suite
    from dm_control.suite.wrappers import pixels
    domain, task = args.environment_name.split('-')
    self._envs = [
        suite.load(domain_name=domain, task_name=task, task_kwargs={'time_limit': np.inf})
        for _ in range(args.env_batch_size)
    ]
    self._envs = [
        pixels.Wrapper(env, render_kwargs={'camera_id': 0})
        for env in self._envs
    ]
    # Time step counter
    self._t = 0
    # Set time step limit
    self._max_t = args.max_episode_length
    # Check whether images or states should be observed
    self._state_obs = args.state_observations
    # Get bit depth for preprocessing the observations
    self._bit_depth = args.bit_depth
    # Get the size of observations
    self._observation_size = (32, 32) if args.downscale_observations else (64, 64)
def test_single_array_observation(self, pixels_only):
    pixel_key = 'depth'
    env = FakeArrayObservationEnvironment()
    observation_spec = env.observation_spec()
    self.assertIsInstance(observation_spec, specs.ArraySpec)
    wrapped = pixels.Wrapper(env, observation_key=pixel_key, pixels_only=pixels_only)
    wrapped_observation_spec = wrapped.observation_spec()
    self.assertIsInstance(wrapped_observation_spec, collections.OrderedDict)
    if pixels_only:
        self.assertEqual(1, len(wrapped_observation_spec))
        self.assertEqual([pixel_key], list(wrapped_observation_spec.keys()))
    else:
        self.assertEqual(2, len(wrapped_observation_spec))
        self.assertEqual([pixels.STATE_KEY, pixel_key],
                         list(wrapped_observation_spec.keys()))
    time_step = wrapped.reset()
    depth_observation = time_step.observation[pixel_key]
    wrapped_observation_spec[pixel_key].validate(depth_observation)
    self.assertEqual(depth_observation.shape, (4, 5, 3))
    self.assertEqual(depth_observation.dtype, np.uint8)
def make_dm_control(env_name, env_config): from dm_control import suite from dm_control.suite.wrappers import pixels from .dm_wrapper import DMControlAdapter, DMControlDummyWrapper pixel_input = env_config.pixel_input domain_name, task_name = env_name.split('-') env = suite.load(domain_name=domain_name, task_name=task_name) if pixel_input: if os.getenv('DISABLE_MUJOCO_RENDERING'): # We are asking for rendering on a pod that cannot support rendering, # This happens in GPU based learners when we only want to create the environment # to see the dimensions. # So we will add a dummy environment # TODO: add a dummy wrapper that only contains the correct specs env = DMControlDummyWrapper(env) #... else: env = pixels.Wrapper(env, render_kwargs={'height': 84, 'width': 84, 'camera_id': 0}) # TODO: what to do with reward visualization # Reward visualization should only be done in the eval agent # env = suite.load(domain_name=domain_name, task_name=task_name, visualize_reward=record_video) env = DMControlAdapter(env, pixel_input) env = FilterWrapper(env, env_config) env = ObservationConcatenationWrapper(env) if pixel_input: env = TransposeWrapper(env) env = GrayscaleWrapper(env) if env_config.frame_stacks > 1: env = FrameStackWrapper(env, env_config) env_config.action_spec = env.action_spec() env_config.obs_spec = env.observation_spec() return env, env_config
def main():
    parser = argparse.ArgumentParser(description='Test learned model')
    parser.add_argument('dir', type=str, help='log directory to load learned model')
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--domain-name', type=str, default='cheetah')
    parser.add_argument('--task-name', type=str, default='run')
    parser.add_argument('-R', '--action-repeat', type=int, default=2)
    parser.add_argument('--episodes', type=int, default=1)
    args = parser.parse_args()

    # define environment and apply wrappers
    env = suite.load(args.domain_name, args.task_name)
    env = pixels.Wrapper(env, render_kwargs={'height': 64, 'width': 64, 'camera_id': 0})
    env = GymWrapper(env)
    env = RepeatAction(env, skip=args.action_repeat)

    # define models
    with open(os.path.join(args.dir, 'args.json'), 'r') as f:
        train_args = json.load(f)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    encoder = Encoder().to(device)
    rssm = RecurrentStateSpaceModel(train_args['state_dim'],
                                    env.action_space.shape[0],
                                    train_args['rnn_hidden_dim']).to(device)
    action_model = ActionModel(train_args['state_dim'], train_args['rnn_hidden_dim'],
                               env.action_space.shape[0]).to(device)

    # load learned parameters
    encoder.load_state_dict(torch.load(os.path.join(args.dir, 'encoder.pth')))
    rssm.load_state_dict(torch.load(os.path.join(args.dir, 'rssm.pth')))
    action_model.load_state_dict(torch.load(os.path.join(args.dir, 'action_model.pth')))

    # define agent
    policy = Agent(encoder, rssm, action_model)

    # test learned model in the environment
    for episode in range(args.episodes):
        policy.reset()
        obs = env.reset()
        done = False
        total_reward = 0
        while not done:
            action = policy(obs)
            obs, reward, done, _ = env.step(action)
            total_reward += reward
            if args.render:
                env.render(height=256, width=256, camera_id=0)
        print('Total test reward at episode [%4d/%4d] is %f'
              % (episode + 1, args.episodes, total_reward))
def __init__(self, env_name, seed, max_episode_length, bit_depth):
    domain, task = env_name.split('-')
    self._env = suite.load(domain_name=domain, task_name=task, task_kwargs={'random': seed})
    self._env = pixels.Wrapper(self._env)
    self.max_episode_length = max_episode_length
    self.action_repeat = CONTROL_SUITE_ACTION_REPEATS[domain]
    self.bit_depth = bit_depth
def test_envs_same(self): # Test that the camera augmentations with magnitude 0 gives the same results # as when no camera augmentations are used. render_kwargs = {'width': 84, 'height': 84, 'camera_id': 0} domain_and_task = [('cartpole', 'swingup'), ('reacher', 'easy'), ('finger', 'spin'), ('cheetah', 'run'), ('ball_in_cup', 'catch'), ('walker', 'walk')] for (domain, task) in domain_and_task: seed = 42 envs = [('baseline', pixels.Wrapper( dm_control_suite.load( domain, task, task_kwargs={'random': seed}), render_kwargs=render_kwargs)), ('no-wrapper', pixels.Wrapper( dm_control_suite.load( domain, task, task_kwargs={'random': seed}), render_kwargs=render_kwargs)), ('w/-camera_kwargs', pixels.Wrapper( distraction_wrap( dm_control_suite.load( domain, task, task_kwargs={'random': seed}), domain), render_kwargs=render_kwargs))] frames = [] for _, env in envs: random_state = np.random.RandomState(42) action_spec = env.action_spec() time_step = env.reset() frames.append([]) while not time_step.last() and len(frames[-1]) < 20: action = random_state.uniform( action_spec.minimum, action_spec.maximum, size=action_spec.shape) time_step = env.step(action) frame = time_step.observation['pixels'][:, :, 0:3] frames[-1].append(frame) frames_np = np.array(frames) for i in range(1, len(envs)): difference = np.mean(abs(frames_np[0] - frames_np[i])) self.assertEqual(difference, 0.)
def __init__(self, env):
    self._env = pixels.Wrapper(env, pixels_only=True)
    action_spec = self._env.action_spec()
    time_step = self._env.reset()
    observation_dm = time_step.observation["pixels"]
    screen_height_dm = observation_dm.shape[0]
    screen_width_dm = observation_dm.shape[1]
    screen_depth_dm = observation_dm.shape[2]
    self.observation_space = Box(low=0, high=255,
                                 shape=(screen_height_dm, screen_width_dm, screen_depth_dm),
                                 dtype=np.uint8)
    self.action_space = Box(action_spec.minimum, action_spec.maximum, dtype=np.float32)
    self.random_action = np.random.uniform(action_spec.minimum, action_spec.maximum,
                                           size=action_spec.shape)
def load_pixels( domain_name: Text, task_name: Text, observation_key: Text = 'pixels', pixels_only: bool = True, task_kwargs=None, environment_kwargs=None, visualize_reward: bool = False, render_kwargs=None, env_wrappers: Sequence[types.PyEnvWrapper] = () ) -> py_environment.PyEnvironment: """Returns an environment from a domain name, task name and optional settings. Args: domain_name: A string containing the name of a domain. task_name: A string containing the name of a task. observation_key: Optional custom string specifying the pixel observation's key in the `OrderedDict` of observations. Defaults to 'pixels'. pixels_only: If True (default), the original set of 'state' observations returned by the wrapped environment will be discarded, and the `OrderedDict` of observations will only contain pixels. If False, the `OrderedDict` will contain the original observations as well as the pixel observations. task_kwargs: Optional `dict` of keyword arguments for the task. environment_kwargs: Optional `dict` specifying keyword arguments for the environment. visualize_reward: Optional `bool`. If `True`, object colours in rendered frames are set to indicate the reward at each step. Default `False`. render_kwargs: Optional `dict` of keyword arguments for rendering. env_wrappers: Iterable with references to wrapper classes to use on the wrapped environment. Returns: The requested environment. Raises: ImportError: if dm_control module was not available. """ dm_env = _load_env(domain_name, task_name, task_kwargs=task_kwargs, environment_kwargs=environment_kwargs, visualize_reward=visualize_reward) dm_env = pixels.Wrapper(dm_env, pixels_only=pixels_only, render_kwargs=render_kwargs, observation_key=observation_key) env = dm_control_wrapper.DmControlWrapper(dm_env, render_kwargs) for wrapper in env_wrappers: env = wrapper(env) return env
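# A minimal usage sketch for the TF-Agents-style loader above (added for illustration,
# not part of the original source): renders 84x84 RGB frames into the 'pixels' key.
env = load_pixels('cartpole', 'swingup',
                  render_kwargs={'width': 84, 'height': 84, 'camera_id': 0})
time_step = env.reset()
print(time_step.observation['pixels'].shape)  # expected (84, 84, 3)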
def __init__(self, env, symbolic, seed, max_episode_length, action_repeat, bit_depth):
    from dm_control import suite
    from dm_control.suite.wrappers import pixels
    domain, task = env.split('-')
    self.symbolic = symbolic
    self._env = suite.load(domain_name=domain, task_name=task, task_kwargs={'random': seed})
    if not symbolic:
        self._env = pixels.Wrapper(self._env)
    self.max_episode_length = max_episode_length
    self.action_repeat = action_repeat
    if action_repeat != CONTROL_SUITE_ACTION_REPEATS[domain]:
        print('Using action repeat %d; recommended action repeat for domain is %d'
              % (action_repeat, CONTROL_SUITE_ACTION_REPEATS[domain]))
    self.bit_depth = bit_depth
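# The PlaNet-style wrappers above carry a `bit_depth` argument; a common preprocessing
# step (sketched here with an illustrative helper name, as an assumption about how these
# wrappers consume it) quantises 8-bit frames and centres them around zero:
import numpy as np

def preprocess_observation(images, bit_depth):
    # Quantise to `bit_depth` bits, centre to [-0.5, 0.5), and add uniform
    # dequantisation noise, as in the PlaNet preprocessing.
    images = np.asarray(images, dtype=np.float32)
    images = np.floor(images / 2 ** (8 - bit_depth)) / 2 ** bit_depth - 0.5
    images += np.random.uniform(0.0, 1.0 / 2 ** bit_depth, size=images.shape)
    return images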
def __init__(self,
             environment: control.Environment,
             *,
             height: int = 84,
             width: int = 84,
             camera_id: int = 0):
    render_kwargs = {'height': height, 'width': width, 'camera_id': camera_id}
    pixel_environment = pixels.Wrapper(environment,
                                       pixels_only=True,
                                       render_kwargs=render_kwargs)
    super().__init__(pixel_environment)
def test_dynamic(self):
    camera_kwargs = get_camera_params(domain_name='cartpole', scale=0.1, dynamic=True)
    env = cartpole.swingup()
    env = camera.DistractingCameraEnv(env, camera_id=0, **camera_kwargs)
    env = pixels.Wrapper(env, render_kwargs={'camera_id': 0})
    action_spec = env.action_spec()
    time_step = env.reset()
    frames = []
    while not time_step.last() and len(frames) < 10:
        action = np.random.uniform(action_spec.minimum, action_spec.maximum,
                                   size=action_spec.shape)
        time_step = env.step(action)
        frames.append(time_step.observation['pixels'])
    self.assertEqual(frames[0].shape, (240, 320, 3))
def __init__(self, env, symbolic, seed, max_episode_length, action_repeat, bit_depth, action_noise_scale=None, render_size=64, use_rgbgr=False): from dm_control import suite from dm_control.suite.wrappers import pixels from dm_control.suite.wrappers import action_noise domain, task = env.split('-') self.symbolic = symbolic self.render_size = render_size if render_size else 64 self.use_rgbgr = use_rgbgr if self.use_rgbgr: self.obs_tuple = (4, render_size, render_size) else: self.obs_tuple = (3, render_size, render_size) self.action_noise_scale = action_noise_scale self._env = suite.load(domain_name=domain, task_name=task, task_kwargs={'random': seed}) if not symbolic: self._env = pixels.Wrapper(self._env) if self.action_noise_scale is not None: self._env = action_noise.Wrapper(self._env, scale=self.action_noise_scale) self.max_episode_length = max_episode_length if action_repeat < 0: try: action_repeat = CONTROL_SUITE_ACTION_REPEATS[domain] except KeyError: action_repeat = 2 self.action_repeat = action_repeat try: if action_repeat != CONTROL_SUITE_ACTION_REPEATS[domain]: print( 'Using action repeat %d; recommended action repeat for domain is %d' % (action_repeat, CONTROL_SUITE_ACTION_REPEATS[domain])) except KeyError: pass self.bit_depth = bit_depth
def load( domain_name, task_name, task_kwargs=None, environment_kwargs=None, env_load_fn=suite.load, # use custom_suite.load for customized env action_repeat_wrapper=wrappers.ActionRepeat, action_repeat=1, frame_stack=4, episode_length=1000, actions_in_obs=True, rewards_in_obs=False, pixels_obs=True, # Render params grayscale=False, visualize_reward=False, render_kwargs=None): """Returns an environment from a domain name, task name.""" env = env_load_fn(domain_name, task_name, task_kwargs=task_kwargs, environment_kwargs=environment_kwargs, visualize_reward=visualize_reward) if pixels_obs: env = pixel_wrapper.Wrapper(env, pixels_only=False, render_kwargs=render_kwargs) env = dm_control_wrapper.DmControlWrapper(env, render_kwargs) if pixels_obs and grayscale: env = GrayscaleWrapper(env) if action_repeat > 1: env = action_repeat_wrapper(env, action_repeat) if pixels_obs: env = FrameStack(env, frame_stack, actions_in_obs, rewards_in_obs) else: env = FlattenState(env) # Adjust episode length based on action_repeat max_episode_steps = (episode_length + action_repeat - 1) // action_repeat # Apply a time limit wrapper at the end to properly trigger all reset() env = wrappers.TimeLimit(env, max_episode_steps) return env
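# A usage sketch for the loader above (illustrative, not part of the original source):
# pixel observations with action repeat and the default 4-frame stack.
env = load('cheetah', 'run',
           action_repeat=4,
           render_kwargs={'width': 84, 'height': 84, 'camera_id': 0})
print(env.observation_spec())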
def test_dict_observation(self, pixels_only):
    pixel_key = 'rgb'
    env = cartpole.swingup()

    # Make sure we are testing the right environment for the test.
    observation_spec = env.observation_spec()
    self.assertIsInstance(observation_spec, collections.OrderedDict)

    width = 320
    height = 240

    # The wrapper should only add one observation.
    wrapped = pixels.Wrapper(env,
                             observation_key=pixel_key,
                             pixels_only=pixels_only,
                             render_kwargs={'width': width, 'height': height})
    wrapped_observation_spec = wrapped.observation_spec()
    self.assertIsInstance(wrapped_observation_spec, collections.OrderedDict)

    if pixels_only:
        self.assertEqual(1, len(wrapped_observation_spec))
        self.assertEqual([pixel_key], list(wrapped_observation_spec.keys()))
    else:
        self.assertEqual(len(observation_spec) + 1, len(wrapped_observation_spec))
        expected_keys = list(observation_spec.keys()) + [pixel_key]
        self.assertEqual(expected_keys, list(wrapped_observation_spec.keys()))

    # Check that the added spec item is consistent with the added observation.
    time_step = wrapped.reset()
    rgb_observation = time_step.observation[pixel_key]
    wrapped_observation_spec[pixel_key].validate(rgb_observation)
    self.assertEqual(rgb_observation.shape, (height, width, 3))
    self.assertEqual(rgb_observation.dtype, np.uint8)
def __init__(self, env_id, seed, max_episode_length=1000):
    super(ControlSuiteEnv, self).__init__()
    domain, task = env_id.split("-")
    from dm_control import suite
    from dm_control.suite.wrappers import pixels
    self._env = suite.load(domain_name=domain, task_name=task, task_kwargs={"random": seed})
    self._env = pixels.Wrapper(self._env)
    self._env.action_space = self.action_size
    self._env.observation_space = self.observation_size
    self._env.reward_range = (-float("inf"), float("inf"))
    self._env.metadata = self.metadata
    self._env.spec = None
    self._env = UnSuite(self._env)
    self.action_repeat = CONTROL_SUITE_ACTION_REPEATS.get(domain, 1)
    self.max_episode_length = max_episode_length * self.action_repeat
    if self.action_repeat != CONTROL_SUITE_ACTION_REPEATS.get(domain, self.action_repeat):
        print("Using action repeat %d; recommended action repeat for domain is %d" % (
            self.action_repeat, CONTROL_SUITE_ACTION_REPEATS[domain]))
    self.t = 0
def __init__(
        self,
        domain,
        task,
        frame_skip=1,
        normalize=False,
        pixel_wrapper_kwargs=None,
        task_kwargs={},
        environment_kwargs={},
        max_path_length=1200,
):
    save__init__args(locals(), underscore=True)
    env = suite.load(domain_name=domain,
                     task_name=task,
                     task_kwargs=task_kwargs,
                     environment_kwargs=environment_kwargs)
    if normalize:
        np.testing.assert_equal(env.action_spec().minimum, -1)
        np.testing.assert_equal(env.action_spec().maximum, 1)
    if pixel_wrapper_kwargs is not None:
        env = pixels.Wrapper(env, **pixel_wrapper_kwargs)
    self._env = env

    self._observation_keys = tuple(env.observation_spec().keys())
    observation_space = convert_dm_control_to_rlpyt_space(env.observation_spec())
    self._observation_space = observation_space

    action_space = convert_dm_control_to_rlpyt_space(env.action_spec())
    if len(action_space.shape) > 1:
        raise NotImplementedError(
            "Shape of the action space ({}) is not flat, make sure to"
            " check the implementation.".format(action_space))
    self._action_space = action_space

    self._step_count = 0
import torch
import numpy as np
# import os
# os.environ["MUJOCO_GL"] = 'osmesa'
from dm_control import suite
from dm_control.suite.wrappers import pixels
import utils

env = suite.load(domain_name="humanoid", task_name="stand")
# import ipdb; ipdb.set_trace()
env = pixels.Wrapper(env)
spec = env.action_spec()
time_step = env.reset()
total_reward = 0.0
frames = [time_step.observation['pixels']]
for t in range(1000):
    print(t)
    action = np.random.uniform(spec.minimum, spec.maximum, spec.shape)
    time_step = env.step(action)
    frames.append(time_step.observation['pixels'].copy())
    total_reward += time_step.reward
print("Total number of frames: {}".format(len(frames)))
# utils.save_gif('humanoid.mp4',
#                [torch.tensor(frame.copy()).float()/255 for frame in frames],
#                color_last=True)
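# The commented-out utils.save_gif call above relies on a project-local helper; a
# standalone alternative for writing the collected frames (a sketch, assuming the
# imageio package is installed):
import imageio
imageio.mimsave('humanoid.gif', [frame.astype(np.uint8) for frame in frames], fps=30)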
def __init__(self,
             domain,
             task,
             *args,
             env=None,
             normalize=True,
             observation_keys=(),
             goal_keys=(),
             unwrap_time_limit=True,
             pixel_wrapper_kwargs=None,
             **kwargs):
    assert not args, (
        "Gym environments don't support args. Use kwargs instead.")
    self.normalize = normalize
    self.unwrap_time_limit = unwrap_time_limit
    super(DmControlAdapter, self).__init__(
        domain, task, *args, goal_keys=goal_keys, **kwargs)
    if env is None:
        assert (domain is not None and task is not None), (domain, task)
        env = suite.load(
            domain_name=domain,
            task_name=task,
            task_kwargs=kwargs
            # TODO(hartikainen): Figure out how to pass kwargs to this guy.
            # Need to split into `task_kwargs`, `environment_kwargs`, and
            # `visualize_reward` bool. Check the suite.load(.) in:
            # https://github.com/deepmind/dm_control/blob/master/dm_control/suite/__init__.py
        )
        self._env_kwargs = kwargs
    else:
        assert not kwargs
        assert domain is None and task is None, (domain, task)

    # Ensure action space is already normalized.
    if normalize:
        np.testing.assert_equal(env.action_spec().minimum, -1)
        np.testing.assert_equal(env.action_spec().maximum, 1)

    if pixel_wrapper_kwargs is not None:
        env = pixels.Wrapper(env, **pixel_wrapper_kwargs)

    self._env = env

    assert isinstance(env.observation_spec(), OrderedDict)
    self.observation_keys = (
        observation_keys or tuple(env.observation_spec().keys()))

    observation_space = convert_dm_control_to_gym_space(env.observation_spec())
    self._observation_space = type(observation_space)([
        (name, copy.deepcopy(space))
        for name, space in observation_space.spaces.items()
        if name in self.observation_keys
    ])

    action_space = convert_dm_control_to_gym_space(self._env.action_spec())
    if len(action_space.shape) > 1:
        raise NotImplementedError(
            "Shape of the action space ({}) is not flat, make sure to"
            " check the implementation.".format(action_space))
    self._action_space = action_space
def main():
    parser = argparse.ArgumentParser(description='Dreamer for DM control')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--log-dir', type=str, default='log')
    parser.add_argument('--test-interval', type=int, default=10)
    parser.add_argument('--domain-name', type=str, default='cheetah')
    parser.add_argument('--task-name', type=str, default='run')
    parser.add_argument('-R', '--action-repeat', type=int, default=2)
    parser.add_argument('--state-dim', type=int, default=30)
    parser.add_argument('--rnn-hidden-dim', type=int, default=200)
    parser.add_argument('--buffer-capacity', type=int, default=1000000)
    parser.add_argument('--all-episodes', type=int, default=1000)
    parser.add_argument('-S', '--seed-episodes', type=int, default=5)
    parser.add_argument('-C', '--collect-interval', type=int, default=100)
    parser.add_argument('-B', '--batch-size', type=int, default=50)
    parser.add_argument('-L', '--chunk-length', type=int, default=50)
    parser.add_argument('-H', '--imagination-horizon', type=int, default=15)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--lambda_', type=float, default=0.95)
    parser.add_argument('--model_lr', type=float, default=6e-4)
    parser.add_argument('--value_lr', type=float, default=8e-5)
    parser.add_argument('--action_lr', type=float, default=8e-5)
    parser.add_argument('--eps', type=float, default=1e-4)
    parser.add_argument('--clip-grad-norm', type=int, default=100)
    parser.add_argument('--free-nats', type=int, default=3)
    parser.add_argument('--action-noise-var', type=float, default=0.3)
    args = parser.parse_args()

    # Prepare logging
    log_dir = os.path.join(args.log_dir, args.domain_name + '_' + args.task_name)
    log_dir = os.path.join(log_dir, datetime.now().strftime('%Y%m%d_%H%M'))
    os.makedirs(log_dir)
    with open(os.path.join(log_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f)
    pprint(vars(args))
    writer = SummaryWriter(log_dir=log_dir)

    # set seed (NOTE: some randomness remains, e.g. cuDNN's randomness)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)

    # define env and apply wrappers
    env = suite.load(args.domain_name, args.task_name, task_kwargs={'random': args.seed})
    env = pixels.Wrapper(env, render_kwargs={'height': 64, 'width': 64, 'camera_id': 0})
    env = GymWrapper(env)
    env = RepeatAction(env, skip=args.action_repeat)

    # define replay buffer
    replay_buffer = ReplayBuffer(capacity=args.buffer_capacity,
                                 observation_shape=env.observation_space.shape,
                                 action_dim=env.action_space.shape[0])

    # define models and optimizer
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    encoder = Encoder().to(device)
    rssm = RecurrentStateSpaceModel(args.state_dim, env.action_space.shape[0],
                                    args.rnn_hidden_dim).to(device)
    obs_model = ObservationModel(args.state_dim, args.rnn_hidden_dim).to(device)
    reward_model = RewardModel(args.state_dim, args.rnn_hidden_dim).to(device)
    model_params = (list(encoder.parameters()) + list(rssm.parameters()) +
                    list(obs_model.parameters()) + list(reward_model.parameters()))
    model_optimizer = Adam(model_params, lr=args.model_lr, eps=args.eps)

    # define value model, action model and their optimizers
    value_model = ValueModel(args.state_dim, args.rnn_hidden_dim).to(device)
    action_model = ActionModel(args.state_dim, args.rnn_hidden_dim,
                               env.action_space.shape[0]).to(device)
    value_optimizer = Adam(value_model.parameters(), lr=args.value_lr, eps=args.eps)
    action_optimizer = Adam(action_model.parameters(), lr=args.action_lr, eps=args.eps)

    # collect seed episodes with random actions
    for episode in range(args.seed_episodes):
        obs = env.reset()
        done = False
        while not done:
            action = env.action_space.sample()
            next_obs, reward, done, _ = env.step(action)
            replay_buffer.push(obs, action, reward, done)
            obs = next_obs

    # main training loop
    for episode in range(args.seed_episodes, args.all_episodes):
        # -----------------------------
        # collect experiences
        # -----------------------------
        start = time.time()
        policy = Agent(encoder, rssm, action_model)
        obs = env.reset()
        done = False
        total_reward = 0
        while not done:
            action = policy(obs)
            action += np.random.normal(0, np.sqrt(args.action_noise_var),
                                       env.action_space.shape[0])
            next_obs, reward, done, _ = env.step(action)
            replay_buffer.push(obs, action, reward, done)
            obs = next_obs
            total_reward += reward
        writer.add_scalar('total reward at train', total_reward, episode)
        print('episode [%4d/%4d] is collected. Total reward is %f'
              % (episode + 1, args.all_episodes, total_reward))
        print('elapsed time for interaction: %.2fs' % (time.time() - start))

        # update parameters of model, value model, action model
        start = time.time()
        for update_step in range(args.collect_interval):
            # ---------------------------------------------------------------
            # update model (encoder, rssm, obs_model, reward_model)
            # ---------------------------------------------------------------
            observations, actions, rewards, _ = \
                replay_buffer.sample(args.batch_size, args.chunk_length)

            # preprocess observations and transpose tensors for RNN training
            observations = preprocess_obs(observations)
            observations = torch.as_tensor(observations, device=device)
            observations = observations.transpose(3, 4).transpose(2, 3)
            observations = observations.transpose(0, 1)
            actions = torch.as_tensor(actions, device=device).transpose(0, 1)
            rewards = torch.as_tensor(rewards, device=device).transpose(0, 1)

            # embed observations with CNN
            embedded_observations = encoder(
                observations.reshape(-1, 3, 64, 64)).view(args.chunk_length, args.batch_size, -1)

            # prepare tensors to hold the state sequence and the rnn hidden state sequence
            states = torch.zeros(args.chunk_length, args.batch_size, args.state_dim,
                                 device=device)
            rnn_hiddens = torch.zeros(args.chunk_length, args.batch_size, args.rnn_hidden_dim,
                                      device=device)

            # initialize state and rnn hidden state with zero vectors
            state = torch.zeros(args.batch_size, args.state_dim, device=device)
            rnn_hidden = torch.zeros(args.batch_size, args.rnn_hidden_dim, device=device)

            # compute state and rnn hidden sequences and KL loss
            kl_loss = 0
            for l in range(args.chunk_length - 1):
                next_state_prior, next_state_posterior, rnn_hidden = \
                    rssm(state, actions[l], rnn_hidden, embedded_observations[l + 1])
                state = next_state_posterior.rsample()
                states[l + 1] = state
                rnn_hiddens[l + 1] = rnn_hidden
                kl = kl_divergence(next_state_prior, next_state_posterior).sum(dim=1)
                kl_loss += kl.clamp(min=args.free_nats).mean()
            kl_loss /= (args.chunk_length - 1)

            # states[0] and rnn_hiddens[0] are always 0 and carry no information
            states = states[1:]
            rnn_hiddens = rnn_hiddens[1:]

            # compute reconstructed observations and predicted rewards
            flatten_states = states.view(-1, args.state_dim)
            flatten_rnn_hiddens = rnn_hiddens.view(-1, args.rnn_hidden_dim)
            recon_observations = obs_model(flatten_states, flatten_rnn_hiddens).view(
                args.chunk_length - 1, args.batch_size, 3, 64, 64)
            predicted_rewards = reward_model(flatten_states, flatten_rnn_hiddens).view(
                args.chunk_length - 1, args.batch_size, 1)

            # compute losses for observation and reward
            obs_loss = 0.5 * mse_loss(
                recon_observations, observations[1:], reduction='none').mean([0, 1]).sum()
            reward_loss = 0.5 * mse_loss(predicted_rewards, rewards[:-1])

            # add all losses and update model parameters with gradient descent
            model_loss = kl_loss + obs_loss + reward_loss
            model_optimizer.zero_grad()
            model_loss.backward()
            clip_grad_norm_(model_params, args.clip_grad_norm)
            model_optimizer.step()

            # ----------------------------------------------
            # update value_model and action_model
            # ----------------------------------------------
            # detach gradients because Dreamer doesn't update the model with the actor-critic loss
            flatten_states = flatten_states.detach()
            flatten_rnn_hiddens = flatten_rnn_hiddens.detach()

            # prepare tensors to hold the imagined trajectory's states and rnn_hiddens
            imaginated_states = torch.zeros(args.imagination_horizon + 1,
                                            *flatten_states.shape,
                                            device=flatten_states.device)
            imaginated_rnn_hiddens = torch.zeros(args.imagination_horizon + 1,
                                                 *flatten_rnn_hiddens.shape,
                                                 device=flatten_rnn_hiddens.device)
            imaginated_states[0] = flatten_states
            imaginated_rnn_hiddens[0] = flatten_rnn_hiddens

            # compute imagined trajectory using actions from action_model
            for h in range(1, args.imagination_horizon + 1):
                actions = action_model(flatten_states, flatten_rnn_hiddens)
                flatten_states_prior, flatten_rnn_hiddens = rssm.prior(flatten_states,
                                                                       actions,
                                                                       flatten_rnn_hiddens)
                flatten_states = flatten_states_prior.rsample()
                imaginated_states[h] = flatten_states
                imaginated_rnn_hiddens[h] = flatten_rnn_hiddens

            # compute rewards and values corresponding to the imagined states and rnn_hiddens
            flatten_imaginated_states = imaginated_states.view(-1, args.state_dim)
            flatten_imaginated_rnn_hiddens = imaginated_rnn_hiddens.view(-1, args.rnn_hidden_dim)
            imaginated_rewards = \
                reward_model(flatten_imaginated_states,
                             flatten_imaginated_rnn_hiddens).view(args.imagination_horizon + 1, -1)
            imaginated_values = \
                value_model(flatten_imaginated_states,
                            flatten_imaginated_rnn_hiddens).view(args.imagination_horizon + 1, -1)

            # compute lambda target
            lambda_target_values = lambda_target(imaginated_rewards, imaginated_values,
                                                 args.gamma, args.lambda_)

            # update value model
            value_loss = 0.5 * mse_loss(imaginated_values, lambda_target_values.detach())
            value_optimizer.zero_grad()
            value_loss.backward(retain_graph=True)
            clip_grad_norm_(value_model.parameters(), args.clip_grad_norm)
            value_optimizer.step()

            # update action model (multiply by -1 for gradient ascent)
            action_loss = -1 * (lambda_target_values.mean())
            action_optimizer.zero_grad()
            action_loss.backward()
            clip_grad_norm_(action_model.parameters(), args.clip_grad_norm)
            action_optimizer.step()

            # print losses and add them to tensorboard
            print('update_step: %3d model loss: %.5f, kl_loss: %.5f, '
                  'obs_loss: %.5f, reward_loss: %.5f, '
                  'value_loss: %.5f action_loss: %.5f'
                  % (update_step + 1, model_loss.item(), kl_loss.item(),
                     obs_loss.item(), reward_loss.item(),
                     value_loss.item(), action_loss.item()))
            total_update_step = episode * args.collect_interval + update_step
            writer.add_scalar('model loss', model_loss.item(), total_update_step)
            writer.add_scalar('kl loss', kl_loss.item(), total_update_step)
            writer.add_scalar('obs loss', obs_loss.item(), total_update_step)
            writer.add_scalar('reward loss', reward_loss.item(), total_update_step)
            writer.add_scalar('value loss', value_loss.item(), total_update_step)
            writer.add_scalar('action loss', action_loss.item(), total_update_step)
        print('elapsed time for update: %.2fs' % (time.time() - start))

        # ----------------------------------------------
        # evaluation without exploration noise
        # ----------------------------------------------
        if (episode + 1) % args.test_interval == 0:
            policy = Agent(encoder, rssm, action_model)
            start = time.time()
            obs = env.reset()
            done = False
            total_reward = 0
            while not done:
                action = policy(obs, training=False)
                obs, reward, done, _ = env.step(action)
                total_reward += reward
            writer.add_scalar('total reward at test', total_reward, episode)
            print('Total test reward at episode [%4d/%4d] is %f'
                  % (episode + 1, args.all_episodes, total_reward))
            print('elapsed time for test: %.2fs' % (time.time() - start))

    # save learned model parameters
    torch.save(encoder.state_dict(), os.path.join(log_dir, 'encoder.pth'))
    torch.save(rssm.state_dict(), os.path.join(log_dir, 'rssm.pth'))
    torch.save(obs_model.state_dict(), os.path.join(log_dir, 'obs_model.pth'))
    torch.save(reward_model.state_dict(), os.path.join(log_dir, 'reward_model.pth'))
    torch.save(value_model.state_dict(), os.path.join(log_dir, 'value_model.pth'))
    torch.save(action_model.state_dict(), os.path.join(log_dir, 'action_model.pth'))
    writer.close()
def main():
    parser = argparse.ArgumentParser(description='PlaNet for DM control')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--log-dir', type=str, default='log')
    parser.add_argument('--test-interval', type=int, default=10)
    parser.add_argument('--domain-name', type=str, default='cheetah')
    parser.add_argument('--task-name', type=str, default='run')
    parser.add_argument('-R', '--action-repeat', type=int, default=4)
    parser.add_argument('--state-dim', type=int, default=30)
    parser.add_argument('--rnn-hidden-dim', type=int, default=200)
    parser.add_argument('--buffer-capacity', type=int, default=1000000)
    parser.add_argument('--all-episodes', type=int, default=1000)
    parser.add_argument('-S', '--seed-episodes', type=int, default=5)
    parser.add_argument('-C', '--collect-interval', type=int, default=100)
    parser.add_argument('-B', '--batch-size', type=int, default=50)
    parser.add_argument('-L', '--chunk-length', type=int, default=50)
    parser.add_argument('--lr', type=float, default=1e-3)
    parser.add_argument('--eps', type=float, default=1e-4)
    parser.add_argument('--clip-grad-norm', type=int, default=1000)
    parser.add_argument('--free-nats', type=int, default=3)
    parser.add_argument('-H', '--horizon', type=int, default=12)
    parser.add_argument('-I', '--N-iterations', type=int, default=10)
    parser.add_argument('-J', '--N-candidates', type=int, default=1000)
    parser.add_argument('-K', '--N-top-candidates', type=int, default=100)
    parser.add_argument('--action-noise-var', type=float, default=0.3)
    args = parser.parse_args()

    # Prepare logging
    log_dir = os.path.join(args.log_dir, args.domain_name + '_' + args.task_name)
    log_dir = os.path.join(log_dir, datetime.now().strftime('%Y%m%d_%H%M'))
    os.makedirs(log_dir)
    with open(os.path.join(log_dir, 'args.json'), 'w') as f:
        json.dump(vars(args), f)
    pprint(vars(args))
    writer = SummaryWriter(log_dir=log_dir)

    # set seed (NOTE: some randomness remains, e.g. cuDNN's randomness)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(args.seed)

    # define env and apply wrappers
    env = suite.load(args.domain_name, args.task_name, task_kwargs={'random': args.seed})
    env = pixels.Wrapper(env, render_kwargs={'height': 64, 'width': 64, 'camera_id': 0})
    env = GymWrapper(env)
    env = RepeatAction(env, skip=args.action_repeat)

    # define replay buffer
    replay_buffer = ReplayBuffer(capacity=args.buffer_capacity,
                                 observation_shape=env.observation_space.shape,
                                 action_dim=env.action_space.shape[0])

    # define models and optimizer
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    encoder = Encoder().to(device)
    rssm = RecurrentStateSpaceModel(args.state_dim, env.action_space.shape[0],
                                    args.rnn_hidden_dim).to(device)
    obs_model = ObservationModel(args.state_dim, args.rnn_hidden_dim).to(device)
    reward_model = RewardModel(args.state_dim, args.rnn_hidden_dim).to(device)
    all_params = (list(encoder.parameters()) + list(rssm.parameters()) +
                  list(obs_model.parameters()) + list(reward_model.parameters()))
    optimizer = Adam(all_params, lr=args.lr, eps=args.eps)

    # collect initial experience with random actions
    for episode in range(args.seed_episodes):
        obs = env.reset()
        done = False
        while not done:
            action = env.action_space.sample()
            next_obs, reward, done, _ = env.step(action)
            replay_buffer.push(obs, action, reward, done)
            obs = next_obs

    # main training loop
    for episode in range(args.seed_episodes, args.all_episodes):
        # collect experiences
        start = time.time()
        cem_agent = CEMAgent(encoder, rssm, reward_model,
                             args.horizon, args.N_iterations,
                             args.N_candidates, args.N_top_candidates)
        obs = env.reset()
        done = False
        total_reward = 0
        while not done:
            action = cem_agent(obs)
            action += np.random.normal(0, np.sqrt(args.action_noise_var),
                                       env.action_space.shape[0])
            next_obs, reward, done, _ = env.step(action)
            replay_buffer.push(obs, action, reward, done)
            obs = next_obs
            total_reward += reward
        writer.add_scalar('total reward at train', total_reward, episode)
        print('episode [%4d/%4d] is collected. Total reward is %f'
              % (episode + 1, args.all_episodes, total_reward))
        print('elapsed time for interaction: %.2fs' % (time.time() - start))

        # update model parameters
        start = time.time()
        for update_step in range(args.collect_interval):
            observations, actions, rewards, _ = \
                replay_buffer.sample(args.batch_size, args.chunk_length)

            # preprocess observations and transpose tensors for RNN training
            observations = preprocess_obs(observations)
            observations = torch.as_tensor(observations, device=device)
            observations = observations.transpose(3, 4).transpose(2, 3)
            observations = observations.transpose(0, 1)
            actions = torch.as_tensor(actions, device=device).transpose(0, 1)
            rewards = torch.as_tensor(rewards, device=device).transpose(0, 1)

            # embed observations with CNN
            embedded_observations = encoder(observations.reshape(
                -1, 3, 64, 64)).view(args.chunk_length, args.batch_size, -1)

            # prepare tensors to hold the state sequence and the rnn hidden state sequence
            states = torch.zeros(args.chunk_length, args.batch_size, args.state_dim,
                                 device=device)
            rnn_hiddens = torch.zeros(args.chunk_length, args.batch_size, args.rnn_hidden_dim,
                                      device=device)

            # initialize state and rnn hidden state with zero vectors
            state = torch.zeros(args.batch_size, args.state_dim, device=device)
            rnn_hidden = torch.zeros(args.batch_size, args.rnn_hidden_dim, device=device)

            # compute state and rnn hidden sequences and KL loss
            kl_loss = 0
            for l in range(args.chunk_length - 1):
                next_state_prior, next_state_posterior, rnn_hidden = \
                    rssm(state, actions[l], rnn_hidden, embedded_observations[l + 1])
                state = next_state_posterior.rsample()
                states[l + 1] = state
                rnn_hiddens[l + 1] = rnn_hidden
                kl = kl_divergence(next_state_prior, next_state_posterior).sum(dim=1)
                kl_loss += kl.clamp(min=args.free_nats).mean()
            kl_loss /= (args.chunk_length - 1)

            # compute reconstructed observations and predicted rewards
            flatten_states = states.view(-1, args.state_dim)
            flatten_rnn_hiddens = rnn_hiddens.view(-1, args.rnn_hidden_dim)
            recon_observations = obs_model(flatten_states, flatten_rnn_hiddens).view(
                args.chunk_length, args.batch_size, 3, 64, 64)
            predicted_rewards = reward_model(flatten_states, flatten_rnn_hiddens).view(
                args.chunk_length, args.batch_size, 1)

            # compute losses for observation and reward
            obs_loss = 0.5 * mse_loss(recon_observations[1:], observations[1:],
                                      reduction='none').mean([0, 1]).sum()
            reward_loss = 0.5 * mse_loss(predicted_rewards[1:], rewards[:-1])

            # add all losses and update model parameters with gradient descent
            loss = kl_loss + obs_loss + reward_loss
            optimizer.zero_grad()
            loss.backward()
            clip_grad_norm_(all_params, args.clip_grad_norm)
            optimizer.step()

            # print losses and add them to tensorboard
            print('update_step: %3d loss: %.5f, kl_loss: %.5f, obs_loss: %.5f, reward_loss: %.5f'
                  % (update_step + 1, loss.item(), kl_loss.item(),
                     obs_loss.item(), reward_loss.item()))
            total_update_step = episode * args.collect_interval + update_step
            writer.add_scalar('overall loss', loss.item(), total_update_step)
            writer.add_scalar('kl loss', kl_loss.item(), total_update_step)
            writer.add_scalar('obs loss', obs_loss.item(), total_update_step)
            writer.add_scalar('reward loss', reward_loss.item(), total_update_step)
        print('elapsed time for update: %.2fs' % (time.time() - start))

        # test to get score without exploration noise
        if (episode + 1) % args.test_interval == 0:
            start = time.time()
            cem_agent = CEMAgent(encoder, rssm, reward_model,
                                 args.horizon, args.N_iterations,
                                 args.N_candidates, args.N_top_candidates)
            obs = env.reset()
            done = False
            total_reward = 0
            while not done:
                action = cem_agent(obs)
                obs, reward, done, _ = env.step(action)
                total_reward += reward
            writer.add_scalar('total reward at test', total_reward, episode)
            print('Total test reward at episode [%4d/%4d] is %f'
                  % (episode + 1, args.all_episodes, total_reward))
            print('elapsed time for test: %.2fs' % (time.time() - start))

    # save learned model parameters
    torch.save(encoder.state_dict(), os.path.join(log_dir, 'encoder.pth'))
    torch.save(rssm.state_dict(), os.path.join(log_dir, 'rssm.pth'))
    torch.save(obs_model.state_dict(), os.path.join(log_dir, 'obs_model.pth'))
    torch.save(reward_model.state_dict(), os.path.join(log_dir, 'reward_model.pth'))
    writer.close()
from dm_control import suite
from dm_control.suite.wrappers import pixels
import numpy as np

DOMAIN_NAME = "acrobot"
TASK_NAME = "swingup"

# Load one task:
env = suite.load(domain_name=DOMAIN_NAME, task_name=TASK_NAME)
# Wrap the environment to obtain the pixels
env = pixels.Wrapper(env, pixels_only=False)

# Step through an episode and print out reward, discount and observation.
action_spec = env.action_spec()
time_step = env.reset()
while not time_step.last():
    action = np.random.uniform(action_spec.minimum, action_spec.maximum,
                               size=action_spec.shape)
    time_step = env.step(action)
    observation_dm = time_step.observation["pixels"]
    print(observation_dm)
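# Illustrative follow-up (not part of the original snippet): with pixels_only=False the
# wrapped observation keeps the state entries alongside the rendered frame, and the
# pixel spec can be inspected directly.
print(list(env.observation_spec().keys()))     # e.g. ['orientations', 'velocity', 'pixels']
print(env.observation_spec()['pixels'].shape)  # default render size, e.g. (240, 320, 3)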
def load(domain_name,
         task_name,
         difficulty=None,
         dynamic=False,
         background_dataset_path=None,
         background_dataset_videos="train",
         background_kwargs=None,
         camera_kwargs=None,
         color_kwargs=None,
         task_kwargs=None,
         environment_kwargs=None,
         visualize_reward=False,
         render_kwargs=None,
         pixels_only=True,
         pixels_observation_key="pixels",
         env_state_wrappers=None):
  """Returns an environment from a domain name, task name and optional settings.

  ```python
  env = suite.load('cartpole', 'balance')
  ```

  Adding a difficulty will configure distractions matching the reference paper
  for easy, medium, hard. Users can also toggle dynamic properties for
  distractions.

  Args:
    domain_name: A string containing the name of a domain.
    task_name: A string containing the name of a task.
    difficulty: Difficulty for the suite. One of 'easy', 'medium', 'hard'.
    dynamic: Boolean controlling whether distractions are dynamic or static.
    background_dataset_path: String to the davis directory that contains the
      video directories.
    background_dataset_videos: String ('train'/'val') or list of strings of the
      DAVIS videos to be used for backgrounds.
    background_kwargs: Dict, overwrites settings for background distractions.
    camera_kwargs: Dict, overwrites settings for camera distractions.
    color_kwargs: Dict, overwrites settings for color distractions.
    task_kwargs: Dict, dm control task kwargs.
    environment_kwargs: Optional `dict` specifying keyword arguments for the
      environment.
    visualize_reward: Optional `bool`. If `True`, object colours in rendered
      frames are set to indicate the reward at each step. Default `False`.
    render_kwargs: Dict, render kwargs for pixel wrapper.
    pixels_only: Boolean controlling the exclusion of states in the
      observation.
    pixels_observation_key: Key in the observation used for the rendered image.
    env_state_wrappers: Env state wrappers to be called before the PixelWrapper.

  Returns:
    The requested environment.
  """
  if not is_available():
    raise ImportError("dm_control module is not available. Make sure you "
                      "follow the installation instructions from the "
                      "dm_control package.")

  if difficulty not in [None, "easy", "medium", "hard"]:
    raise ValueError("Difficulty should be one of: 'easy', 'medium', 'hard'.")

  render_kwargs = render_kwargs or {}
  if "camera_id" not in render_kwargs:
    render_kwargs["camera_id"] = 2 if domain_name == "quadruped" else 0

  env = suite.load(domain_name,
                   task_name,
                   task_kwargs=task_kwargs,
                   environment_kwargs=environment_kwargs,
                   visualize_reward=visualize_reward)

  # Apply background distractions.
  if difficulty or background_kwargs:
    background_dataset_path = (background_dataset_path or
                               suite_utils.DEFAULT_BACKGROUND_PATH)
    final_background_kwargs = dict()
    if difficulty:
      # Get kwargs for the given difficulty.
      num_videos = suite_utils.DIFFICULTY_NUM_VIDEOS[difficulty]
      final_background_kwargs.update(
          suite_utils.get_background_kwargs(domain_name, num_videos, dynamic,
                                            background_dataset_path,
                                            background_dataset_videos))
    else:
      # Set the dataset path and the videos.
      final_background_kwargs.update(
          dict(dataset_path=background_dataset_path,
               dataset_videos=background_dataset_videos))
    if background_kwargs:
      # Overwrite kwargs with those passed here.
      final_background_kwargs.update(background_kwargs)
    env = background.DistractingBackgroundEnv(env, **final_background_kwargs)

  # Apply camera distractions.
  if difficulty or camera_kwargs:
    final_camera_kwargs = dict(camera_id=render_kwargs["camera_id"])
    if difficulty:
      # Get kwargs for the given difficulty.
      scale = suite_utils.DIFFICULTY_SCALE[difficulty]
      final_camera_kwargs.update(
          suite_utils.get_camera_kwargs(domain_name, scale, dynamic))
    if camera_kwargs:
      # Overwrite kwargs with those passed here.
      final_camera_kwargs.update(camera_kwargs)
    env = camera.DistractingCameraEnv(env, **final_camera_kwargs)

  # Apply color distractions.
  if difficulty or color_kwargs:
    final_color_kwargs = dict()
    if difficulty:
      # Get kwargs for the given difficulty.
      scale = suite_utils.DIFFICULTY_SCALE[difficulty]
      final_color_kwargs.update(suite_utils.get_color_kwargs(scale, dynamic))
    if color_kwargs:
      # Overwrite kwargs with those passed here.
      final_color_kwargs.update(color_kwargs)
    env = color.DistractingColorEnv(env, **final_color_kwargs)

  if env_state_wrappers is not None:
    for wrapper in env_state_wrappers:
      env = wrapper(env)

  # Apply the pixel wrapper after the distractions. This is needed to ensure
  # the changes from the distraction wrappers are applied to the MuJoCo
  # environment before the rendering occurs.
  env = pixels.Wrapper(env,
                       pixels_only=pixels_only,
                       render_kwargs=render_kwargs,
                       observation_key=pixels_observation_key)
  return env
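# A minimal usage sketch for the loader above (my addition, not part of the original
# snippet). It assumes the DAVIS videos for background distractions have been downloaded
# to a local path; the path shown is illustrative.
env = load('cheetah', 'run',
           difficulty='easy',
           dynamic=True,
           background_dataset_path='path/to/DAVIS/JPEGImages/480p',
           render_kwargs={'width': 84, 'height': 84})
time_step = env.reset()
print(time_step.observation['pixels'].shape)  # e.g. (84, 84, 3)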
def main():
    parser = argparse.ArgumentParser(
        description='Open-loop video prediction with learned model')
    parser.add_argument('dir', type=str, help='log directory to load learned model')
    parser.add_argument('--length', type=int, default=50,
                        help='the length of video prediction')
    parser.add_argument('--domain-name', type=str, default='cheetah')
    parser.add_argument('--task-name', type=str, default='run')
    parser.add_argument('-R', '--action-repeat', type=int, default=2)
    args = parser.parse_args()

    # define environment and apply wrappers
    env = suite.load(args.domain_name, args.task_name)
    env = pixels.Wrapper(env, render_kwargs={'height': 64, 'width': 64, 'camera_id': 0})
    env = GymWrapper(env)
    env = RepeatAction(env, skip=args.action_repeat)

    # define models
    with open(os.path.join(args.dir, 'args.json'), 'r') as f:
        train_args = json.load(f)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    encoder = Encoder().to(device)
    rssm = RecurrentStateSpaceModel(train_args['state_dim'],
                                    env.action_space.shape[0],
                                    train_args['rnn_hidden_dim']).to(device)
    obs_model = ObservationModel(train_args['state_dim'],
                                 train_args['rnn_hidden_dim']).to(device)
    action_model = ActionModel(train_args['state_dim'], train_args['rnn_hidden_dim'],
                               env.action_space.shape[0]).to(device)

    # load learned parameters
    encoder.load_state_dict(torch.load(os.path.join(args.dir, 'encoder.pth')))
    rssm.load_state_dict(torch.load(os.path.join(args.dir, 'rssm.pth')))
    obs_model.load_state_dict(torch.load(os.path.join(args.dir, 'obs_model.pth')))
    action_model.load_state_dict(torch.load(os.path.join(args.dir, 'action_model.pth')))

    # define agent
    policy = Agent(encoder, rssm, action_model)

    # open-loop video prediction
    # select the starting point of open-loop prediction randomly
    starting_point = torch.randint(1000 // args.action_repeat - args.length, (1, )).item()

    # interact with the environment until the starting point to accumulate context
    # in policy.rnn_hidden
    obs = env.reset()
    for _ in range(starting_point):
        action = policy(obs)
        obs, _, _, _ = env.step(action)

    # preprocess observation and embed it with the encoder
    preprocessed_obs = preprocess_obs(obs)
    preprocessed_obs = torch.as_tensor(preprocessed_obs, device=device)
    preprocessed_obs = preprocessed_obs.transpose(1, 2).transpose(0, 1).unsqueeze(0)
    with torch.no_grad():
        embedded_obs = encoder(preprocessed_obs)

    # compute state using the embedded observation
    # NOTE: after this, the state is updated only using the prior,
    # i.e. the model doesn't see the observation
    rnn_hidden = policy.rnn_hidden
    state = rssm.posterior(rnn_hidden, embedded_obs).sample()
    frame = np.zeros((64, 128, 3))
    frames = []
    for _ in range(args.length):
        # the action is selected the same way as at training time (closed-loop)
        action = policy(obs)
        obs, _, _, _ = env.step(action)

        # update state and reconstruct the observation with the same action
        action = torch.as_tensor(action, device=device).unsqueeze(0)
        with torch.no_grad():
            state_prior, rnn_hidden = rssm.prior(state, action, rnn_hidden)
            state = state_prior.sample()
            predicted_obs = obs_model(state, rnn_hidden)

        # arrange the ground-truth frame and the predicted frame side by side
        frame[:, :64, :] = preprocess_obs(obs)
        frame[:, 64:, :] = predicted_obs.squeeze().transpose(0, 1).transpose(1, 2).cpu().numpy()
        frames.append((frame + 0.5).clip(0.0, 1.0))

    save_video_as_gif(frames)
def __init__(self,
             level: LevelSelection,
             frame_skip: int,
             visualization_parameters: VisualizationParameters,
             target_success_rate: float = 1.0,
             seed: Union[None, int] = None,
             human_control: bool = False,
             observation_type: ObservationType = ObservationType.Measurements,
             custom_reward_threshold: Union[int, float] = None,
             **kwargs):
    """
    :param level: (str)
        A string representing the control suite level to run. This can also be a
        LevelSelection object. For example, cartpole:swingup.

    :param frame_skip: (int)
        The number of frames to skip between any two actions given by the agent.
        The action will be repeated for all the skipped frames.

    :param visualization_parameters: (VisualizationParameters)
        The parameters used for visualizing the environment, such as the render flag,
        storing videos etc.

    :param target_success_rate: (float)
        Stop the experiment if the given target success rate was achieved.

    :param seed: (int)
        A seed to use for the random number generator when running the environment.

    :param human_control: (bool)
        A flag that allows controlling the environment using the keyboard keys.

    :param observation_type: (ObservationType)
        An enum which defines which observation to use. The current options are to use:
        * Measurements only - a vector of joint torques and similar measurements
        * Image only - an image of the environment as seen by a camera attached to the
          simulator
        * Measurements & Image - both types of observations will be returned in the state
          using the keys 'measurements' and 'pixels' respectively.

    :param custom_reward_threshold: (float)
        Allows defining a custom reward that will be used to decide when the agent
        succeeded in passing the environment.
    """
    super().__init__(level, seed, frame_skip, human_control, custom_reward_threshold,
                     visualization_parameters, target_success_rate)

    self.observation_type = observation_type

    # load and initialize environment
    domain_name, task_name = self.env_id.split(":")
    self.env = suite.load(domain_name=domain_name, task_name=task_name,
                          task_kwargs={'random': seed})

    if observation_type != ObservationType.Measurements:
        self.env = pixels.Wrapper(self.env,
                                  pixels_only=observation_type == ObservationType.Image)

    # seed
    if self.seed is not None:
        np.random.seed(self.seed)
        random.seed(self.seed)

    self.state_space = StateSpace({})

    # image observations
    if observation_type != ObservationType.Measurements:
        self.state_space['pixels'] = ImageObservationSpace(
            shape=self.env.observation_spec()['pixels'].shape, high=255)

    # measurements observations
    if observation_type != ObservationType.Image:
        measurements_space_size = 0
        measurements_names = []
        for observation_space_name, observation_space in self.env.observation_spec().items():
            if len(observation_space.shape) == 0:
                measurements_space_size += 1
                measurements_names.append(observation_space_name)
            elif len(observation_space.shape) == 1:
                measurements_space_size += observation_space.shape[0]
                measurements_names.extend(["{}_{}".format(observation_space_name, i)
                                           for i in range(observation_space.shape[0])])
        self.state_space['measurements'] = VectorObservationSpace(
            shape=measurements_space_size, measurements_names=measurements_names)

    # actions
    self.action_space = BoxActionSpace(
        shape=self.env.action_spec().shape[0],
        low=self.env.action_spec().minimum,
        high=self.env.action_spec().maximum)

    # initialize the state by getting a new state from the environment
    self.reset_internal_state(True)

    # render
    if self.is_rendered:
        image = self.get_rendered_image()
        scale = 1
        if self.human_control:
            scale = 2
        if not self.native_rendering:
            self.renderer.create_screen(image.shape[1] * scale, image.shape[0] * scale)

    self.target_success_rate = target_success_rate
def __init__(self, domain_name, task_name, horizon=None, gamma=0.99, task_kwargs=None, dt=.01, width_screen=480, height_screen=480, camera_id=0, use_pixels=False, pixels_width=64, pixels_height=64): """ Constructor. Args: domain_name (str): name of the environment; task_name (str): name of the task of the environment; horizon (int): the horizon; gamma (float): the discount factor; task_kwargs (dict, None): parameters of the task; dt (float, .01): duration of a control step; width_screen (int, 480): width of the screen; height_screen (int, 480): height of the screen; camera_id (int, 0): position of camera to render the environment; use_pixels (bool, False): if True, pixel observations are used rather than the state vector; pixels_width (int, 64): width of the pixel observation; pixels_height (int, 64): height of the pixel observation; """ # MDP creation self.env = suite.load(domain_name, task_name, task_kwargs=task_kwargs) if use_pixels: self.env = pixels.Wrapper(self.env, render_kwargs={ 'width': pixels_width, 'height': pixels_height }) # get the default horizon if horizon is None: horizon = self.env._step_limit # Hack to ignore dm_control time limit. self.env._step_limit = np.inf if use_pixels: self._convert_observation_space = self._convert_observation_space_pixels self._convert_observation = self._convert_observation_pixels else: self._convert_observation_space = self._convert_observation_space_vector self._convert_observation = self._convert_observation_vector # MDP properties action_space = self._convert_action_space(self.env.action_spec()) observation_space = self._convert_observation_space( self.env.observation_spec()) mdp_info = MDPInfo(observation_space, action_space, gamma, horizon) self._viewer = ImageViewer((width_screen, height_screen), dt) self._camera_id = camera_id super().__init__(mdp_info) self._state = None
def __init__( self, level: LevelSelection, frame_skip: int, visualization_parameters: VisualizationParameters, seed: Union[None, int] = None, human_control: bool = False, observation_type: ObservationType = ObservationType.Measurements, custom_reward_threshold: Union[int, float] = None, **kwargs): super().__init__(level, seed, frame_skip, human_control, custom_reward_threshold, visualization_parameters) self.observation_type = observation_type # load and initialize environment domain_name, task_name = self.env_id.split(":") self.env = suite.load(domain_name=domain_name, task_name=task_name, task_kwargs={'random': seed}) if observation_type != ObservationType.Measurements: self.env = pixels.Wrapper( self.env, pixels_only=observation_type == ObservationType.Image) # seed if self.seed is not None: np.random.seed(self.seed) random.seed(self.seed) self.state_space = StateSpace({}) # image observations if observation_type != ObservationType.Measurements: self.state_space['pixels'] = ImageObservationSpace( shape=self.env.observation_spec()['pixels'].shape, high=255) # measurements observations if observation_type != ObservationType.Image: measurements_space_size = 0 measurements_names = [] for observation_space_name, observation_space in self.env.observation_spec( ).items(): if len(observation_space.shape) == 0: measurements_space_size += 1 measurements_names.append(observation_space_name) elif len(observation_space.shape) == 1: measurements_space_size += observation_space.shape[0] measurements_names.extend([ "{}_{}".format(observation_space_name, i) for i in range(observation_space.shape[0]) ]) self.state_space['measurements'] = VectorObservationSpace( shape=measurements_space_size, measurements_names=measurements_names) # actions self.action_space = BoxActionSpace( shape=self.env.action_spec().shape[0], low=self.env.action_spec().minimum, high=self.env.action_spec().maximum) # initialize the state by getting a new state from the environment self.reset_internal_state(True) # render if self.is_rendered: image = self.get_rendered_image() scale = 1 if self.human_control: scale = 2 if not self.native_rendering: self.renderer.create_screen(image.shape[1] * scale, image.shape[0] * scale)