def main(argv):
    del argv
    environment_name = FLAGS.environment_name
    if environment_name is None:
        print('\n '.join(['Available environments:'] + _ALL_NAMES))
        environment_name = prompt_environment_name(
            'Please select an environment name: ', _ALL_NAMES)
    index = _ALL_NAMES.index(environment_name)
    domain_name, task_name = (suite.ALL_TASKS + custom_suite.ALL_TASKS)[index]
    task_kwargs = {}
    if not FLAGS.timeout:
        task_kwargs['time_limit'] = float('inf')

    def loader():
        try:
            env = suite.load(domain_name=domain_name, task_name=task_name,
                             task_kwargs=task_kwargs)
        except ValueError:
            task_kwargs['params'] = [0.25, 1.0, 8]
            env = custom_suite.load(domain_name=domain_name, task_name=task_name,
                                    task_kwargs=task_kwargs)
        env.task.visualize_reward = FLAGS.visualize_reward
        if FLAGS.action_noise > 0:
            env = action_noise.Wrapper(env, scale=FLAGS.action_noise)
        return env

    viewer.launch(loader)
def main(unused_argv):
    # The viewer calls the environment_loader on episode resets. However, the
    # task cycles through one clip per episode. To avoid replaying the first
    # clip again and again, we construct the environment outside the viewer to
    # make it persistent across resets.
    env = mocap_playback_env()
    viewer.launch(environment_loader=lambda: env)
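# A minimal sketch of the same persistent-environment pattern with a standard
# suite task (mocap_playback_env above is defined elsewhere in the original
# script; the task chosen here is only for illustration):
from dm_control import suite, viewer

env = suite.load(domain_name='humanoid', task_name='walk')
# Reusing one instance means per-environment state (such as a clip index)
# survives the viewer's episode resets, unlike a loader that rebuilds the env.
viewer.launch(environment_loader=lambda: env)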
def main(_):
    if FLAGS.suite == 'dm_control':
        logging.info('Loading from dm_control...')
        env = suite.load(domain_name=FLAGS.domain_name, task_name=FLAGS.task_name)
    elif FLAGS.suite == 'rwrl':
        logging.info('Loading from rwrl...')
        env = rwrl.load(domain_name=FLAGS.domain_name, task_name=FLAGS.task_name)
    else:
        raise ValueError('Unknown suite: {}'.format(FLAGS.suite))
    random_policy = RandomAgent(env.action_spec()).action
    viewer.launch(env, policy=random_policy)
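# RandomAgent is defined elsewhere in the original script; a minimal sketch
# consistent with how it is used above (uniform sampling from the action spec):
import numpy as np


class RandomAgent:
    def __init__(self, action_spec):
        self._spec = action_spec

    def action(self, time_step):
        del time_step  # Unused.
        return np.random.uniform(low=self._spec.minimum,
                                 high=self._spec.maximum,
                                 size=self._spec.shape)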
def main(argv):
    if len(argv) > 1:
        raise app.UsageError("Too many command-line arguments.")
    viewer.launch(environment_loader=functools.partial(
        soccer.load,
        team_size=2,
        walker_type=soccer.WalkerType[FLAGS.walker_type],
        disable_walker_contacts=FLAGS.disable_walker_contacts,
        enable_field_box=FLAGS.enable_field_box,
        keep_aspect_ratio=True,
        terminate_on_goal=FLAGS.terminate_on_goal))
def test_interact(self, model_path, random=False):
    """Load trained parameters and interact with the environment."""
    if not random:
        self._actor.load_state_dict(torch.load(model_path))
    if self.benchmark == "dm_control":
        if random:
            def random_policy(time_step):
                del time_step  # Unused.
                return np.random.uniform(
                    low=self._env.action_spec().minimum,
                    high=self._env.action_spec().maximum,
                    size=self._env.action_spec().shape)

            viewer.launch(self._env, policy=random_policy)
        else:
            def source_policy(time_step):
                # Flatten the observation dict into a single state vector.
                s = None
                for v in time_step.observation.values():
                    if s is None:
                        s = v.flatten()
                    else:
                        s = np.hstack([s, v])
                s_3d = np.reshape(s, [1, self.state_dim])
                mu, std = self._actor(torch.Tensor(s_3d).to(self.dev))
                return self._actor.get_action(mu, std)

            viewer.launch(self._env, policy=source_policy)
    elif self.benchmark == "gym":
        for ep in range(self.test_iter):
            score = 0
            done = False
            state = self._env.reset()
            state = np.reshape(state, [1, self.state_dim])
            while not done:
                mu, std = self._actor(torch.Tensor(state).to(self.dev))
                action = self._actor.get_action(mu, std)
                if random:
                    next_state, reward, done, info = self._env.step(
                        np.random.randn(self.action_dim))
                else:
                    next_state, reward, done, info = self._env.step(action)
                self._env.render()
                score = self.gamma * score + reward
                state = np.reshape(next_state, [1, self.state_dim])
            print(f"test iter : {ep}\tscore : {score}")
def make_env(self, args=None, kwargs=None, dm_task_name=None):
    """Create a dm_control/metaworld environment."""
    args = args or ()
    kwargs = kwargs or {}
    if self.metaworld_env:
        env = mtw_envs_rand[self.env_name](*args, **kwargs)
        if debug_mode:
            env._max_episode_steps = 10000
            env.reset()
            env.render()
            global action_to_take
            glfw.set_key_callback(env.unwrapped.viewer.window, on_press)
            while True:
                env.render()
                if not np.array_equal(action_to_take, np.zeros(6)):
                    _, _, d, _ = env.step(action_to_take)
                    if d:
                        env.seed(self.cfg.seed)
                        env.reset()
                        env.render()
                    # Commenting this out makes the mocap faster but
                    # introduces some instabilities.
                    # action_to_take = np.zeros(6)
    else:
        camera_id = 2 if self.domain_name == 'quadruped' else 0
        if dm_task_name is not None:
            task_name = dm_task_name
        else:
            task_name = self.task_name
        env = dmc2gym.make(
            domain_name=self.domain_name,
            task_name=task_name,
            seed=self.cfg.seed,
            visualize_reward=False,
            from_pixels=False,
            height=self.cfg.image_size,
            width=self.cfg.image_size,
            frame_skip=self.cfg.action_repeat,
            camera_id=camera_id)
        if debug_mode:
            from dm_control import viewer
            viewer.launch(env)
    env = FrameStack(env, k=self.cfg.frame_stack)
    env.seed(self.cfg.seed)
    return env
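# `on_press` is defined elsewhere in the original script; a minimal sketch
# consistent with its use above, mapping key presses into the global
# `action_to_take` vector (the bindings and magnitudes here are illustrative):
import glfw
import numpy as np

action_to_take = np.zeros(6)


def on_press(window, key, scancode, action, mods):
    global action_to_take
    if action != glfw.PRESS:
        return
    if key == glfw.KEY_UP:
        action_to_take = np.array([0.0, 1.0, 0.0, 0.0, 0.0, 0.0])
    elif key == glfw.KEY_DOWN:
        action_to_take = np.array([0.0, -1.0, 0.0, 0.0, 0.0, 0.0])
    elif key == glfw.KEY_SPACE:
        action_to_take = np.zeros(6)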
def main(argv):
    del argv
    environment_name = FLAGS.environment_name
    all_names = list(manipulation.ALL)
    if environment_name is None:
        print('\n '.join(['Available environments:'] + all_names))
        environment_name = prompt_environment_name(
            'Please select an environment name: ', all_names)
    loader = functools.partial(
        manipulation.load, environment_name=environment_name)
    viewer.launch(loader)
def build_and_test(model_path, config_key):
    import dmc_wrapper
    from dm_control import viewer
    from rlpyt.utils.buffer import buffer_from_example, torchify_buffer, numpify_buffer
    import torch

    config = configs[config_key]
    reloaded = torch.load(model_path) if len(model_path) > 0 else None
    agent = MultiFfAgent(
        model_kwargs=config["model"],
        initial_model_state_dict=reloaded['agent_state_dict'] if reloaded else None,
        **config["agent"])
    dm_env = maw.load(
        team_size=args.team_size,
        time_limit=args.time_limit,
        terrain=not args.no_hfield,
        agent_type=args.agent_type,
        deterministic_spawn=not args.random_spawn,
        raise_exception_on_physics_error=False,
        task_id=args.task_id)
    env = GymEnvWrapper(dmc2gym.DmControlWrapper('', '', env=dm_env))
    agent.initialize(env.spaces)
    agent.reset()
    # agent.eval_mode(0)
    prev_action = env.action_space.null_value()

    def get_prev_action():
        return prev_action

    def policy(time_step):
        obs = dmc_wrapper.convertObservation(time_step.observation)
        reward = time_step.reward
        reward = np.asarray(reward) if reward is not None else reward
        obs_pyt, act_pyt, rew_pyt = torchify_buffer(
            (obs, get_prev_action(), reward))
        act_pyt, agent_info = agent.step(obs_pyt.float(), act_pyt, rew_pyt)
        # Updating prev_action here would require `nonlocal prev_action`.
        # prev_action = act_pyt
        return act_pyt

    viewer.launch(dm_env, policy=policy)
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')
    environment_name = FLAGS.environment_name
    if environment_name == 'mujoban':
        walker = walkers.JumpingBallWithHead(add_ears=True, camera_height=0.25)
        arena = MujobanLevel(boxoban_level_generator)
        task = Mujoban(
            walker=walker,
            maze=arena,
            control_timestep=CONTROL_TIMESTEP,
            top_camera_height=64,
            top_camera_width=48)
        env = composer.Environment(
            time_limit=TIME_LIMIT, task=task, strip_singleton_obs_buffer_dim=True)
    else:
        env = functools.partial(board_games.load, environment_name=environment_name)
    viewer.launch(env)
def main(argv):
    del argv
    environment_name = FLAGS.environment_name
    if environment_name is None:
        print('\n '.join(['Available environments:'] + _ALL_NAMES))
        environment_name = prompt_environment_name(
            'Please select an environment name: ', _ALL_NAMES)
    index = _ALL_NAMES.index(environment_name.lower())
    domain_name, task_name = suite.ALL_TASKS[index]
    task_kwargs = {}
    if not FLAGS.timeout:
        task_kwargs['time_limit'] = float('inf')

    def loader():
        env = suite.load(
            domain_name=domain_name, task_name=task_name, task_kwargs=task_kwargs)
        env.task.visualize_reward = FLAGS.visualize_reward
        return env

    viewer.launch(loader)
def __init__(self, seed, difficulty="easy", render=False):
    self.seed = seed
    self.env_name = "reacher"
    self.env = suite.load(self.env_name, difficulty,
                          visualize_reward=True,
                          task_kwargs={"random": seed})
    self.render = render

    # Debug logs
    self.committed_actions = []
    self.time_step = 0

    if render:
        viewer.launch(self.env)

    MDP.__init__(
        self,
        range(self.env.action_spec().minimum.shape[0]),
        self._transition_func,
        self._reward_func,
        init_state=FixedReacherState(self.env.reset().observation))
def visualize_trajectory(argv):
    env = build_env(
        reward_type=FLAGS.reward_type,
        ghost_offset=1,
        clip_name=FLAGS.clip_name,
        start_step=FLAGS.start_step,
    )
    actions = np.load(FLAGS.load_actions_path)
    analyze_trajectory(env, actions)

    def policy(time_step):
        global step
        if time_step.first():
            step = 0
        else:
            step += 1
        if step < len(actions):
            return actions[step]
        else:
            print('{} Out of actions - returning zeros'.format(step))
            return np.zeros_like(actions[0])

    viewer.launch(env, policy=policy)
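# A hedged alternative to the `global step` counter above: keep the replay
# index in a closure cell so the policy needs no module-level state.
def make_replay_policy(actions):
    step = [0]

    def policy(time_step):
        if time_step.first():
            step[0] = 0
        else:
            step[0] += 1
        if step[0] < len(actions):
            return actions[step[0]]
        return np.zeros_like(actions[0])

    return policy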
def get_policy(dump, action_spec):
    state_dict = torch.load(dump, map_location="cpu")
    policy = Actor(*state_dict["args"].tolist())
    policy.load_state_dict(state_dict)
    policy.eval()

    @torch.no_grad()
    def _policy(time_step):
        state = np.concatenate(list(time_step.observation.values()))
        state_tensor = torch.tensor(state, dtype=torch.float)
        p = policy(state_tensor).numpy()
        return np.clip(p, action_spec.minimum, action_spec.maximum)

    return _policy


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--dump", type=str, default="dumps/model.pth")
    parser.add_argument("--env", nargs=2, type=str,
                        default=["cartpole", "swingup"])
    args = parser.parse_args()

    env = suite.load(domain_name=args.env[0], task_name=args.env[1])
    action_spec = env.action_spec()
    policy = get_policy(args.dump, action_spec)
    viewer.launch(env, policy)
def main():
    env = suite.load(domain_name="quadruped", task_name="escape")
    viewer.launch(environment_loader=env)
def main():
    # viewer.launch(environment_loader=ant_run)
    # viewer.launch(environment_loader=ant_run_long)
    # viewer.launch(environment_loader=ant_run_walls)
    viewer.launch(environment_loader=ant_run_gaps)
from dm_control import suite
from dm_control import viewer
import numpy as np

# Load one task:
env = suite.load(domain_name="quadruped", task_name="fetch")

# Iterate over a task set:
for domain_name, task_name in suite.BENCHMARKING:
    env = suite.load(domain_name, task_name)
    # viewer.launch(env)

# Step through an episode and print out reward, discount and observation.
action_spec = env.action_spec()
time_step = env.reset()
all_actions = []
while not time_step.last():
    action = np.random.uniform(action_spec.minimum, action_spec.maximum,
                               size=action_spec.shape)
    time_step = env.step(action)
    print(time_step.reward, time_step.discount, time_step.observation)
    all_actions.append(action)

# viewer.launch expects a callable policy rather than a list of actions, so
# replay the recorded episode through a small policy function.
step_index = [0]


def replay_policy(time_step):
    if time_step.first():
        step_index[0] = 0
    action = all_actions[min(step_index[0], len(all_actions) - 1)]
    step_index[0] += 1
    return action


viewer.launch(env, policy=replay_policy)
# Define a linear control policy.
def linear_control_policy(time_step):
    # State variables.
    x_dot = time_step.observation['velocity'][0]
    theta_dot = time_step.observation['velocity'][1]
    x = time_step.observation['position'][0]
    theta = np.arccos(time_step.observation['position'][1])

    # Calculate the control input u = -K^T x.
    x_vec = np.array([[theta], [theta_dot], [x], [x_dot]])
    u = np.matmul(-np.transpose(K), x_vec)

    # The viewer applies the returned action itself, so the policy only
    # returns u instead of calling env.step.
    return u


# Launch the viewer application.
viewer.launch(env, policy=linear_control_policy)

# Save K.
with open(f"K_{int(time.time())}.pickle", "wb") as f:
    pickle.dump(K, f)

# Plotting
plt.plot(range(num_ep), ep_rewards)
plt.ylabel("Episode Rewards")
plt.xlabel("Episode #")
plt.show()
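# The gain matrix K is computed or loaded elsewhere in the original script.
# As a hedged sketch, a gain of this shape could come from a continuous-time
# LQR solve via scipy; the A and B below are placeholders, not the actual
# cartpole linearization, so the solve lines are left commented out.
import numpy as np
from scipy import linalg

A = np.zeros((4, 4))  # placeholder dynamics for [theta, theta_dot, x, x_dot]
B = np.zeros((4, 1))  # placeholder input matrix
Q = np.eye(4)         # state cost
R = np.eye(1)         # control cost
# P = linalg.solve_continuous_are(A, B, Q, R)
# K = (np.linalg.inv(R) @ B.T @ P).T  # shape (4, 1); used above as u = -K^T x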
def random_policy(time_step=None):
    del time_step  # Unused.
    low = -0.3 * np.ones((2, 1))
    high = 0.3 * np.ones((2, 1))
    return np.random.uniform(low=low, high=high, size=(2, 1))


if __name__ == '__main__':
    env = RopeEnv(use_visual_observation=False, use_image_goal=False,
                  n_substeps=20)
    viewer.launch(ViewerWrapper(env), policy=random_policy)

    # After the viewer window is closed, step the environment manually.
    # env.reset()
    # print(env.physics.named.model.body_pos)
    # print(env.goal_state)
    # print(env.physics.get_state())
    # action = random_policy().squeeze()
    action = np.ones((2,)) * 100
    while True:
        # print(env.physics.data.ctrl)
        pixels, _, _, _ = env.step(action.squeeze())
        pixels = pixels['observation']
        pixels = pixels / 255.0
        pixels = pixels[:, :, ::-1]  # BGR to RGB
        print(action)
def view_render(env, agent):
    def best_action_policy(time_step):
        return agent.get_best_action(get_state(time_step.observation))

    viewer.launch(env.env, policy=best_action_policy)
def create_model(self):
    model = Sequential()
    model.add(Conv2D(256, (3, 3), input_shape=OBSERVATION_SPACE_VALUES))
    model.add(Activation("relu"))
    model.add(MaxPooling2D(2, 2))
    model.add(Dropout(0.2))

    model.add(Conv2D(256, (3, 3)))
    model.add(Activation("relu"))
    model.add(MaxPooling2D(2, 2))
    model.add(Dropout(0.2))

    model.add(Flatten())
    model.add(Dense(64))
    model.add(Dense(ACTION_SPACE_SIZE, activation="linear"))
    model.compile(loss="mse", optimizer=Adam(lr=0.001), metrics=['accuracy'])
    return model

def update_replay_memory(self, transition):
    self.replay_memory.append(transition)

def get_qs(self, state):
    # Reshape the state into a batch of one for prediction.
    return self.model.predict(np.array(state).reshape(-1, *state.shape) / 255)[0]

def train(self, terminal_state, step):
    if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
        return
    minibatch = random.sample(self.replay_memory, MINIBATCH_SIZE)
    current_states = np.array([transition[0] for transition in minibatch]) / 255
    current_qs_list = self.model.predict(current_states)
    new_current_states = np.array([transition[3] for transition in minibatch]) / 255
    future_qs_list = self.target_model.predict(new_current_states)

    X = []
    y = []
    for index, (current_state, action, reward, new_current_state, done) in enumerate(minibatch):
        if not done:
            max_future_q = np.max(future_qs_list[index])
            new_q = reward + DISCOUNT * max_future_q
        else:
            new_q = reward
        current_qs = current_qs_list[index]
        current_qs[action] = new_q
        X.append(current_state)
        y.append(current_qs)

    self.model.fit(np.array(X) / 255, np.array(y), batch_size=MINIBATCH_SIZE,
                   verbose=0, shuffle=False,
                   callbacks=[self.tensorboard] if terminal_state else None)
    if terminal_state:
        self.target_update_counter += 1
    if self.target_update_counter > UPDATE_TARGET_EVERY:
        self.target_model.set_weights(self.model.get_weights())
        self.target_update_counter = 0


# Start DQN
agent = DQNAgent()

# Set up Environment
env = suite.load(domain_name="cartpole", task_name="balance_sparse")
initial_values = env.reset()

# Recording Performance
ep_rewards = []
aggr_ep_rewards = {'ep': [], 'avg': [], 'min': [], 'max': []}

for episode in tqdm(range(1, EPISODES + 1), ascii=True, unit="episode"):
    agent.tensorboard.step = episode
    episode_reward = 0
    step = 1
    done = False
    time_step = env.reset()
    current_state = np.concatenate(
        (time_step.observation['position'], time_step.observation['velocity']))
    while not done:
        # Decide if taking a random action w/ epsilon
        if np.random.random() > epsilon:
            action = np.argmax(agent.get_qs(current_state))
        else:
            action = np.random.randint(0, ACTION_SPACE_SIZE)

        # Perform the Action in the Environment
        time_step = env.step(action)
        reward = time_step.reward
        new_state = np.concatenate(
            (time_step.observation['position'], time_step.observation['velocity']))
        if time_step.discount is None:
            done = True
        if not done:
            episode_reward += time_step.reward
        agent.update_replay_memory((current_state, action, reward, new_state, done))
        agent.train(done, step)
        current_state = new_state
        step += 1

    if END_EPSILON_DECAYING >= episode >= START_EPSILON_DECAYING:
        epsilon -= epsilon_decay_value

    ep_rewards.append(episode_reward)
    if not episode % SHOW_EVERY:
        average_reward = sum(ep_rewards[-SHOW_EVERY:]) / len(ep_rewards[-SHOW_EVERY:])
        aggr_ep_rewards['ep'].append(episode)
        aggr_ep_rewards['avg'].append(average_reward)
        aggr_ep_rewards['min'].append(min(ep_rewards[-SHOW_EVERY:]))
        aggr_ep_rewards['max'].append(max(ep_rewards[-SHOW_EVERY:]))

# Set up Environment
env = suite.load(domain_name="cartpole", task_name="balance_sparse")
initial_values = env.reset()
# Get Possible Actions for Environment
action_spec = env.action_spec()

# Initialize Q Table
initial_observations = np.concatenate(
    (initial_values.observation['position'], initial_values.observation['velocity']))
DISCRETE_OS_SIZE = np.array([30] * len(initial_observations))
guess_high_observation = 1.5
guess_low_observation = -1.5
discrete_os_win_size = np.array(
    [guess_high_observation - guess_low_observation] * 5) / DISCRETE_OS_SIZE
action_space = np.array([50])

# Parameters
Learning_Rate = 0.1
Discount = 0.99
Episodes = 10000
SHOW_EVERY = 50
epsilon = 0.5
START_EPSILON_DECAYING = 1
END_EPSILON_DECAYING = int(Episodes // 1.5)
epsilon_decay_value = epsilon / (END_EPSILON_DECAYING - START_EPSILON_DECAYING)

q_table = np.random.uniform(
    low=-1, high=1, size=np.concatenate((DISCRETE_OS_SIZE, action_space)))

# Recording Performance
ep_rewards = []
aggr_ep_rewards = {'ep': [], 'avg': [], 'min': [], 'max': []}


# Discretize State
def get_discrete_state(state):
    # Convert the continuous state into Q-table bin indices.
    discrete_state = (state - guess_low_observation) / discrete_os_win_size
    return tuple(discrete_state.astype(int))


discrete_state = get_discrete_state(initial_observations)
# print(q_table[discrete_state])

# Go through Episodes for Training
for episode in range(Episodes):
    done = False
    episode_reward = 0.0
    if episode % SHOW_EVERY == 0:
        print(episode)

    # Reset Environment
    initial_values = env.reset()
    initial_observations = np.concatenate(
        (initial_values.observation['position'], initial_values.observation['velocity']))
    discrete_state = get_discrete_state(initial_observations)

    while not done:
        # Take an action within the range of actions, epsilon-greedily.
        if np.random.random() > epsilon:
            action = np.argmax(q_table[discrete_state])
        else:
            action = np.random.randint(0, 50)
        action_take = (action / 25) - 1  # Map the discrete index to [-1, 1].

        # Perform the Action in the Environment
        time_step = env.step(action_take)
        observations = np.concatenate(
            (time_step.observation['position'], time_step.observation['velocity']))

        # Get new Discrete State
        new_discrete_state = get_discrete_state(observations)
        if time_step.discount is None:
            done = True
        if not done:
            max_future_q = np.max(q_table[new_discrete_state])
            current_q = q_table[discrete_state + (action,)]
            new_q = (1 - Learning_Rate) * current_q + Learning_Rate * (
                time_step.reward + Discount * max_future_q)
            q_table[discrete_state + (action,)] = new_q
            episode_reward += time_step.reward
        discrete_state = new_discrete_state

    if END_EPSILON_DECAYING >= episode >= START_EPSILON_DECAYING:
        epsilon -= epsilon_decay_value

    ep_rewards.append(episode_reward)
    if not episode % SHOW_EVERY:
        average_reward = sum(ep_rewards[-SHOW_EVERY:]) / len(ep_rewards[-SHOW_EVERY:])
        aggr_ep_rewards['ep'].append(episode)
        aggr_ep_rewards['avg'].append(average_reward)
        aggr_ep_rewards['min'].append(min(ep_rewards[-SHOW_EVERY:]))
        aggr_ep_rewards['max'].append(max(ep_rewards[-SHOW_EVERY:]))

# Reset Environment
initial_values = env.reset()
initial_observations = np.concatenate(
    (initial_values.observation['position'], initial_values.observation['velocity']))
discrete_state = get_discrete_state(initial_observations)
done = False

# Define a policy that acts greedily from the learned Q table (and keeps
# updating it while the viewer runs).
_prev = {'state': None, 'action': None}


def greedy_q_policy(time_step):
    # The viewer applies the returned action itself, so the policy must not
    # call env.step; it maps the current observation to an action and updates
    # the Q table for the previous transition.
    if time_step.first():
        _prev['state'] = None
    observations = np.concatenate(
        (time_step.observation['position'], time_step.observation['velocity']))
    discrete_state = get_discrete_state(observations)

    if _prev['state'] is not None and time_step.discount is not None:
        max_future_q = np.max(q_table[discrete_state])
        current_q = q_table[_prev['state'] + (_prev['action'],)]
        new_q = (1 - Learning_Rate) * current_q + Learning_Rate * (
            time_step.reward + Discount * max_future_q)
        q_table[_prev['state'] + (_prev['action'],)] = new_q

    # Take the greedy action from the Q table.
    action = np.argmax(q_table[discrete_state])
    _prev['state'], _prev['action'] = discrete_state, action

    # Print the Results of the Action
    print("reward = {}, discount = {}, observations = {}.".format(
        time_step.reward, time_step.discount, time_step.observation))
    return (action / 25) - 1  # Same index-to-torque mapping as in training.


plt.plot(aggr_ep_rewards['ep'], aggr_ep_rewards['avg'], label="avg")
plt.plot(aggr_ep_rewards['ep'], aggr_ep_rewards['min'], label="min")
plt.plot(aggr_ep_rewards['ep'], aggr_ep_rewards['max'], label="max")
plt.legend(loc=4)
plt.show()

# Launch the viewer application.
viewer.launch(env, policy=greedy_q_policy)
def play_control_suite(agent, environment):
    '''Launches an agent in a DeepMind Control Suite-based environment.'''

    from dm_control import viewer

    class Wrapper:
        '''Wrapper used to plug a Tonic environment into a dm_control viewer.'''

        def __init__(self, environment):
            self.environment = environment
            self.unwrapped = environment.unwrapped
            self.action_spec = self.unwrapped.environment.action_spec
            self.physics = self.unwrapped.environment.physics
            self.infos = None
            self.episodes = 0

        def reset(self):
            '''Mimics a dm_control reset for the viewer.'''
            self.observations = self.environment.reset()[None]
            self.score = 0
            self.length = 0
            return self.unwrapped.last_time_step

        def step(self, actions):
            '''Mimics a dm_control step for the viewer.'''
            ob, rew, term, _ = self.environment.step(actions)
            self.score += rew
            self.length += 1
            timeout = self.length == self.environment.max_episode_steps
            done = term or timeout
            if done:
                print()
                self.episodes += 1
                print('Episodes:', self.episodes)
                print('Score:', self.score)
                print('Length:', self.length)
            self.observations = ob[None]
            self.infos = dict(
                observations=ob[None],
                rewards=np.array([rew]),
                resets=np.array([done]),
                terminations=[term])
            return self.unwrapped.last_time_step

    # Wrap the environment for the viewer.
    environment = Wrapper(environment)

    def policy(timestep):
        '''Mimics a dm_control policy for the viewer.'''
        if environment.infos is not None:
            agent.test_update(**environment.infos)
        return agent.test_step(environment.observations)

    # Launch the viewer with the wrapped environment and policy.
    viewer.launch(environment, policy)
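# Hypothetical usage; how the Tonic agent and environment are built and
# restored depends on the surrounding script, so the names here are
# illustrative only:
# agent = load_trained_tonic_agent()             # illustrative helper
# environment = make_tonic_control_suite_env()   # illustrative helper
# play_control_suite(agent, environment)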
def main(unused_argv):
    viewer.launch(environment_loader=basic_cmu_2019.cmu_humanoid_run_gaps)
def main():
    args = parse_args()
    if not (bool(args.viewer) ^ bool(args.save_path)):
        raise Exception("you need to provide --viewer xor --save-dir "
                        "arguments for this to do anything useful :)")
    if args.threads is not None:
        torch.set_num_threads(args.threads)

    # TODO: The next few calls are copy-pasted out of train.py. Consider
    # refactoring so that you don't have to copy-paste (otoh not very important
    # since this code only needs to be run once).
    if torch.cuda.is_available():
        dev = torch.device('cuda')
    else:
        dev = torch.device('cpu')
    pre_transform_image_size = args.pre_transform_image_size \
        if 'crop' in args.data_augs else args.image_size
    env = dmc2gym.make(
        domain_name=args.domain_name,
        task_name=args.task_name,
        seed=args.seed,
        visualize_reward=False,
        from_pixels=(args.encoder_type == 'pixel'),
        height=pre_transform_image_size,
        width=pre_transform_image_size,
        frame_skip=args.action_repeat)
    env.seed(args.seed)
    action_shape = env.action_space.shape
    obs_shape = (3 * args.frame_stack, args.image_size, args.image_size)
    agent = RadSacAgent(
        obs_shape=obs_shape,
        action_shape=action_shape,
        device=dev,
        hidden_dim=args.hidden_dim,
        encoder_type=args.encoder_type,
        encoder_feature_dim=args.encoder_feature_dim,
        num_layers=args.num_layers,
        num_filters=args.num_filters,
        latent_dim=args.latent_dim,
        data_augs=args.data_augs,
    )
    agent.load_ac(actor_path=args.actor_path)

    if args.viewer:
        dmc_env = unwrap(env)
        frames = collections.deque(maxlen=args.frame_stack or 1)

        def loaded_policy(time_step):
            # time_step just contains joint angles; we want the image observation.
            obs = env.env._get_obs(time_step)
            frames.append(obs)
            while len(frames) < frames.maxlen:  # for init
                frames.append(obs)
            stacked_obs = np.concatenate(frames, axis=0) / 255.
            return agent.sample_action(stacked_obs)

        viewer.launch(dmc_env, policy=loaded_policy)
        return  # done

    # Otherwise, we need to save a bunch of imitation.data.TrajectoryWithRew
    # instances to some directory somewhere…
    all_traj = []
    for t in range(args.ntraj):
        traj = sample_traj_stacked(env, agent, frame_stack=args.frame_stack or 1)
        all_traj.append(traj)
    # For now I'm just going to save all trajectories in one file.
    print(f"Saving to '{args.save_path}'")
    save_compressed_pickle(all_traj, args.save_path)
    env.close()
from dm_control import composer
from dm_control.locomotion.examples import basic_cmu_2019
from dm_control import viewer
import numpy as np

# Build an example environment.
env = basic_cmu_2019.cmu_humanoid_run_walls()
viewer.launch(environment_loader=basic_cmu_2019.cmu_humanoid_run_walls)

action_spec = env.action_spec()

# Step through the environment for one episode with random actions.
time_step = env.reset()
while not time_step.last():
    action = np.random.uniform(action_spec.minimum, action_spec.maximum,
                               size=action_spec.shape)
    time_step = env.step(action)
    print("reward = {}, discount = {}, observations = {}.".format(
        time_step.reward, time_step.discount, time_step.observation))
from lib.dm_control import suite  # This hack lets us define and use our own envs.
from dm_control import viewer
import numpy as np

env = suite.load(domain_name="humanoid_CMU", task_name="stand")
action_spec = env.action_spec()


# Define a uniform random policy.
def random_policy(time_step):
    del time_step  # Unused.
    return np.random.uniform(low=action_spec.minimum,
                             high=action_spec.maximum,
                             size=action_spec.shape)


# Launch the viewer application.
viewer.launch(env, policy=random_policy, width=1024, height=768)
def main(unused_argv):
    viewer.launch(environment_loader=TASKS[FLAGS.task])
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')
    viewer.launch(
        environment_loader=functools.partial(soccer.load, team_size=2))
from optimal_agents.morphology import Morphology
from optimal_agents.morphology import random2d
from optimal_agents.envs.dm_control_env import dm_control_test_env
from optimal_agents.morphology import arenas
from dm_control import viewer
import numpy as np

global_kwargs = {"option.timestep": 0.01}
geom_kwargs = {
    "contype": 1,
    "conaffinity": 1,
    "condim": 3,
    "friction": [0.4, 0.1, 0.1],
}
joint_kwargs = {"damping": 2, "armature": 0.1, "stiffness": 20}

morphology = random2d(mutation_kwargs={})
env = dm_control_test_env(morphology, arena=arenas.GM_Terrain())
action_spec = env.action_spec()


def random_policy(time_step):
    del time_step  # Unused.
    return np.random.uniform(low=action_spec.minimum,
                             high=action_spec.maximum,
                             size=action_spec.shape)


viewer.launch(env, policy=random_policy)
from dm_control import suite
from dm_control import viewer

# Load an environment from the Control Suite.
env = suite.load(domain_name="humanoid", task_name="stand")

# Launch the viewer application.
viewer.launch(env)
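# A hedged variant of the same setup: render a frame offscreen instead of
# opening the interactive viewer (Physics.render is part of the dm_control
# API; the camera id and image size below are arbitrary choices).
frame = env.physics.render(height=240, width=320, camera_id=0)
print(frame.shape)  # (240, 320, 3) uint8 RGB array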