from dm_control import suite
from dm_control.glviz import viz
import numpy as np

# Load one task:
env = suite.load(domain_name="humanoid", task_name="walk")
# env = suite.load(domain_name="cartpole", task_name="balance")
# env = suite.load(domain_name="acrobot", task_name="swingup")
# env = suite.load(domain_name="ball_in_cup", task_name="catch")
# env = suite.load(domain_name="cheetah", task_name="run")
# env = suite.load(domain_name="finger", task_name="spin")
# env = suite.load(domain_name="fish", task_name="swim")  # needs tweaking: ellipsoid support
# env = suite.load(domain_name="hopper", task_name="stand")
# env = suite.load(domain_name="manipulator", task_name="bring_ball")  # needs tweaking: cylinder support and different lighting position
# env = suite.load(domain_name="pendulum", task_name="swingup")
# env = suite.load(domain_name="point_mass", task_name="easy")
# env = suite.load(domain_name="reacher", task_name="easy")
# env = suite.load(domain_name="swimmer", task_name="swimmer6")
# env = suite.load(domain_name="primitives", task_name="test")

visualizer = viz.Visualizer(env.physics)

# Step through an episode and print out reward, discount and observation.
action_spec = env.action_spec()
time_step = env.reset()

_paused = False
while not time_step.last():
    action = np.random.uniform(action_spec.minimum, action_spec.maximum,
                               size=action_spec.shape)
    time_step = env.step(action)
    print(time_step.reward, time_step.discount, time_step.observation)
        t, r, _, s2 = timestep
        s2 = torch.FloatTensor(utils.state_1d_flat(s2)).to(device)

        s = s2
        ep_reward += r
        prev_action = a

    if video_info is not None:
        video_saver.release()

    return ep_reward


if __name__ == "__main__":
    env = suite.load(domain_name=domain_name, task_name=task_name)

    state_dim = utils.state_1d_dim_calc(env)[-1]
    action_dim = env.action_spec().shape[-1]

    utils.append_file_writer(record_dir, "exp_detail.txt", "state_dim : " + str(state_dim) + "\n")
    utils.append_file_writer(record_dir, "exp_detail.txt", "action_dim : " + str(action_dim) + "\n")

    replay_buffer = ReplayBuffer.ReplayBuffer(buffer_size=buffer_size)
    MSEcriterion = nn.MSELoss()

    actor_main = DDPGActor(state_dim, action_dim, actor_lr, device)
    actor_target = DDPGActor(state_dim, action_dim, actor_lr, device)
def __init__(self, domain, task, task_kwargs=None, visualize_reward=False):
    self._dmenv = suite.load(domain, task, task_kwargs, visualize_reward)
    self._viewer = None
def loader():
    env = suite.load(
        domain_name=domain_name, task_name=task_name, task_kwargs=task_kwargs)
    env.task.visualize_reward = FLAGS.visualize_reward
    return env
from dm_control import suite
from dm_control import viewer
import numpy as np
# import matplotlib.pyplot as plt

# Set up Environment
env = suite.load(domain_name="cartpole", task_name="balance_sparse")
initial_values = env.reset()

# Get Possible Actions for Environment
action_spec = env.action_spec()

# Initialize Q Table
initial_observations = np.concatenate((initial_values.observation['position'], initial_values.observation['velocity']))
DISCRETE_OS_SIZE = np.array([50] * len(initial_observations))
guess_high_observation = 2
guess_low_observation = -2
discrete_os_win_size = np.array([guess_high_observation - guess_low_observation] * 5) / DISCRETE_OS_SIZE
action_space = np.array([3])

# Parameters
Learning_Rate = 0.05
Discount = 0.95
Episodes = 10000
SHOW_EVERY = 50
epsilon = 0.5
START_EPSILON_DECAYING = 1
END_EPSILON_DECAYING = Episodes // 2  # // Ensures no float
def from_suite(cls, domain_name, task_name):
    return cls(suite.load(domain_name, task_name),
               name='{}.{}'.format(domain_name, task_name))
def make_environment(evaluation: bool = False):
    del evaluation  # Unused.
    environment = suite.load('cartpole', 'balance')
    wrapped = wrappers.SinglePrecisionWrapper(environment)
    return wrapped
print("---------------------------------------") # info for particular task task_kwargs = {} if args.domain == 'jaco': if args.fence_name == 'jodesk': # .1f is too low - joint 4 hit sometimes!!!! task_kwargs['fence'] = {'x':(-.5,.5), 'y':(-1.0, .4), 'z':(.15, 1.2)} else: task_kwargs['fence'] = {'x':(-5,5), 'y':(-5, 5), 'z':(.15, 1.2)} if args.use_robot: task_kwargs['physics_type'] = 'robot' args.eval_filename_modifier += 'robot' else: task_kwargs['physics_type'] = 'mujoco' _env = suite.load(domain_name=args.domain, task_name=args.task, task_kwargs=task_kwargs, environment_kwargs=environment_kwargs) kwargs = get_kwargs(_env) del _env # if we need to make a movie, must have frames if np.max([args.plot_movie, args.plot_action_movie, args.plot_frames]): args.state_pixels = True if not args.state_pixels: cam_dim = [0,0,0] else: if args.convert_to_gray: cam_dim = [args.frame_height, args.frame_width, 1] else: cam_dim = [args.frame_height, args.frame_width, 3] # Set seeds torch.manual_seed(args.seed)
def evaluate(load_model_filepath):
    print("starting evaluation for {} episodes".format(args.num_eval_episodes))
    policy, train_step, results_dir, loaded_modelpath = load_policy(load_model_filepath)
    eval_seed = args.seed + train_step
    task_kwargs['random'] = eval_seed
    load_model_base = loaded_modelpath.replace('.pt', '')
    plotting.plot_loss_dict(policy, load_model_base)
    state_names_dict = get_state_names_dict()
    train_replay_buffer = load_replay_buffer(load_model_base + '.pkl')
    eval_env = suite.load(domain_name=args.domain, task_name=args.task,
                          task_kwargs=task_kwargs, environment_kwargs=environment_kwargs)
    # generate random seed
    random_state = np.random.RandomState(eval_seed)

    train_dir = os.path.join(load_model_base + '_train%s' % args.eval_filename_modifier)
    if not os.path.exists(train_dir):
        os.makedirs(train_dir)
    train_base = os.path.join(train_dir, get_step_filename(train_step) + '_train')
    plotting.plot_replay_reward(train_replay_buffer, train_base, start_step=train_step, name_modifier='train')
    plotting.plot_states(train_replay_buffer.get_last_steps(train_replay_buffer.size),
                         train_base, detail_dict=state_names_dict)

    eval_dir = os.path.join(load_model_base + '_eval%s' % args.eval_filename_modifier)
    if not os.path.exists(eval_dir):
        os.makedirs(eval_dir)
    print('saving results to dir: {}'.format(eval_dir))
    eval_base = os.path.join(eval_dir, get_step_filename(train_step) + '_eval_S{:05d}'.format(eval_seed))
    eval_step_filepath = eval_base + '%s.epkl' % args.eval_filename_modifier

    if os.path.exists(eval_step_filepath) and not args.overwrite_replay:
        print('loading existing replay buffer:{}'.format(eval_step_filepath))
        eval_replay_buffer = load_replay_buffer(eval_step_filepath)
    else:
        eval_replay_buffer = ReplayBuffer(kwargs['state_dim'], kwargs['action_dim'],
                                          max_size=int(args.eval_replay_size),
                                          cam_dim=cam_dim, seed=eval_seed)
        for e in range(args.num_eval_episodes):
            done = False
            num_steps = 0
            state_type, reward, discount, state = eval_env.reset()
            frame_compressed = get_next_frame(eval_env)
            # TODO off by one error in step count!? of replay_buffer
            while done == False:
                action = (
                    policy.select_action(state['observations'])
                ).clip(-kwargs['max_action'], kwargs['max_action'])
                # Perform action
                step_type, reward, discount, next_state = eval_env.step(action)
                next_frame_compressed = get_next_frame(eval_env)
                done = step_type.last()
                # Store data in replay buffer
                eval_replay_buffer.add(state['observations'], action, reward,
                                       next_state['observations'], done,
                                       frame_compressed=frame_compressed,
                                       next_frame_compressed=next_frame_compressed)
                frame_compressed = next_frame_compressed
                state = next_state
                num_steps += 1
                time.sleep(.1)

            # plot episode
            er = int(eval_replay_buffer.episode_rewards[-1])
            epath = eval_base + '_E{}_R{}'.format(e, er)
            exp = eval_replay_buffer.get_last_steps(num_steps)
            plotting.plot_states(exp, epath, detail_dict=state_names_dict)
            if args.domain == 'jaco':
                plotting.plot_position_actions(exp, epath, relative=True)
            if np.max([args.plot_movie, args.plot_action_movie, args.plot_frames]):
                emovie_path = epath + 'CAM{}.mp4'.format(e, er, args.camera_view)
                print('plotting episode: {}'.format(emovie_path))
                plotting.plot_frames(emovie_path, eval_replay_buffer.get_last_steps(num_steps),
                                     plot_action_frames=args.plot_action_movie,
                                     min_action=-kwargs['max_action'], max_action=kwargs['max_action'],
                                     plot_frames=args.plot_frames)

        eval_replay_buffer.shrink_to_last_step()
        pickle.dump(eval_replay_buffer, open(eval_step_filepath, 'wb'))

    # plot evaluation
    plotting.plot_replay_reward(eval_replay_buffer, eval_base, start_step=train_step, name_modifier='eval')
    plotting.plot_states(eval_replay_buffer.get_last_steps(eval_replay_buffer.size),
                         eval_base, detail_dict=state_names_dict)
    if np.max([args.plot_movie, args.plot_action_movie, args.plot_frames]):
        movie_path = eval_base + '_CAM{}.mp4'.format(args.camera_view)
        plotting.plot_frames(movie_path, eval_replay_buffer.get_last_steps(eval_replay_buffer.size),
                             plot_action_frames=args.plot_action_movie,
                             min_action=-kwargs['max_action'], max_action=kwargs['max_action'],
                             plot_frames=args.plot_frames)
    return eval_replay_buffer, eval_step_filepath
help="Shuffle REINFORCE samples from episode") parser.add_argument("--record", default=False, action="store_true", help="Make movies of agent") args = parser.parse_args() # create logs if args.experiment_name is None: args.experiment_name = datetime.now().strftime('%b%d_%H-%M-%S') writer = SummaryWriter( log_dir=os.path.join('project/logs', args.experiment_name)) logger = Logger('project/logs', args.experiment_name) # create env env = suite.load(*args.environment.split('-')) # Create model agent = Agent(env=env, H=args.H, K=args.K, traj_length=args.traj_length, softmax=args.softmax, predict_rewards=args.predict_reward, writer=writer, reinforce=args.reinforce, lr=args.lr, temperature=args.temperature, reinforce_lr=args.reinforce_lr, hidden_units=args.hidden_units, batch_size=args.reinforce_batchsize,
def create_model(self):
    model = Sequential()
    model.add(Conv2D(256, (3, 3), input_shape=OBSERVATION_SPACE_VALUES))
    model.add(Activation("relu"))
    model.add(MaxPooling2D(2, 2))
    model.add(Dropout(0.2))

    model.add(Conv2D(256, (3, 3)))
    model.add(Activation("relu"))
    model.add(MaxPooling2D(2, 2))
    model.add(Dropout(0.2))

    model.add(Flatten())
    model.add(Dense(64))
    model.add(Dense(ACTION_SPACE_SIZE, activation="linear"))
    model.compile(loss="mse", optimizer=Adam(lr=0.001), metrics=['accuracy'])
    return model

def update_replay_memory(self, transition):
    self.replay_memory.append(transition)

def get_qs(self, state):
    return self.model.predict(np.array(state).reshape(-1, *state.shape) / 255)[0]
    # -- Probably change this (I believe this is to reshape q value)

def train(self, terminal_state, step):
    if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
        return

    minibatch = random.sample(self.replay_memory, MINIBATCH_SIZE)

    current_states = np.array([transition[0] for transition in minibatch]) / 255
    current_qs_list = self.model.predict(current_states)

    new_current_states = np.array([transition[3] for transition in minibatch]) / 255
    future_qs_list = self.target_model.predict(new_current_states)

    X = []
    y = []

    for index, (current_state, action, reward, new_current_state, done) in enumerate(minibatch):
        if not done:
            max_future_q = np.max(future_qs_list[index])
            new_q = reward + DISCOUNT * max_future_q
        else:
            new_q = reward

        current_qs = current_qs_list[index]
        current_qs[action] = new_q

        X.append(current_state)
        y.append(current_qs)

    self.model.fit(np.array(X) / 255, np.array(y), batch_size=MINIBATCH_SIZE, verbose=0,
                   shuffle=False, callbacks=[self.tensorboard] if terminal_state else None)

    if terminal_state:
        self.target_update_counter += 1

    if self.target_update_counter > UPDATE_TARGET_EVERY:
        self.target_model.set_weights(self.model.get_weights())
        self.target_update_counter = 0


# Start DQN
agent = DQNAgent()

# Set up Environment
env = suite.load(domain_name="cartpole", task_name="balance_sparse")
initial_values = env.reset()

# Recording Performance
ep_rewards = []
aggr_ep_rewards = {'ep': [], 'avg': [], 'min': [], 'max': []}

for episode in tqdm(range(1, EPISODES + 1), ascii=True, unit="episode"):
    agent.tensorboard.step = episode
    episode_reward = 0
    step = 1
    time_step = env.reset()
    current_state = np.concatenate((time_step.observation['position'], time_step.observation['velocity']))
    done = False

    while not done:
        # Decide if taking a random action w/ epsilon
        if np.random.random() > epsilon:
            action = np.argmax(agent.get_qs(current_state))
        else:
            action = np.random.randint(0, ACTION_SPACE_SIZE)

        # Perform the Action in the Environment
        time_step = env.step(action)
        reward = time_step.reward
        new_state = np.concatenate((time_step.observation['position'], time_step.observation['velocity']))
        if time_step.discount is None:
            done = True
        if not done:
            episode_reward += time_step.reward

        agent.update_replay_memory((current_state, action, reward, new_state, done))
        agent.train(done, step)

        current_state = new_state
        step += 1

    if END_EPSILON_DECAYING >= episode >= START_EPSILON_DECAYING:
        epsilon -= epsilon_decay_value

    ep_rewards.append(episode_reward)
    if not episode % SHOW_EVERY:
        average_reward = sum(ep_rewards[-SHOW_EVERY:]) / len(ep_rewards[-SHOW_EVERY:])
        aggr_ep_rewards['ep'].append(episode)
        aggr_ep_rewards['avg'].append(average_reward)
        aggr_ep_rewards['min'].append(min(ep_rewards[-SHOW_EVERY:]))
        aggr_ep_rewards['max'].append(max(ep_rewards[-SHOW_EVERY:]))

# Set up Environment
env = suite.load(domain_name="cartpole", task_name="balance_sparse")
initial_values = env.reset()
# Get Possible Actions for Environment
action_spec = env.action_spec()

# Initialize Q Table
initial_observations = np.concatenate((initial_values.observation['position'], initial_values.observation['velocity']))
DISCRETE_OS_SIZE = np.array([30] * len(initial_observations))
guess_high_observation = 1.5
guess_low_observation = -1.5
discrete_os_win_size = np.array([guess_high_observation - guess_low_observation] * 5) / DISCRETE_OS_SIZE
action_space = np.array([50])

# Parameters
Learning_Rate = 0.1
Discount = 0.99
Episodes = 10000
SHOW_EVERY = 50
epsilon = 0.5
START_EPSILON_DECAYING = 1
END_EPSILON_DECAYING = Episodes // 1.5  # // Ensures no float
epsilon_decay_value = epsilon / (END_EPSILON_DECAYING - START_EPSILON_DECAYING)

q_table = np.random.uniform(low=-1, high=1, size=np.concatenate((DISCRETE_OS_SIZE, action_space)))

# Recording Performance
ep_rewards = []
aggr_ep_rewards = {'ep': [], 'avg': [], 'min': [], 'max': []}

# Discretize State
def get_discrete_state(state):
    discrete_state = (state - [guess_low_observation] * 5) * discrete_os_win_size
    return tuple(discrete_state.astype(int))

discrete_state = get_discrete_state(initial_observations)
# print(q_table[discrete_state])

# Go through Episodes for Training
for episode in range(Episodes):
    done = False
    episode_reward = 0.0
    if episode % SHOW_EVERY == 0:
        print(episode)

    # Reset Environment
    initial_values = env.reset()
    initial_observations = np.concatenate((initial_values.observation['position'], initial_values.observation['velocity']))
    discrete_state = get_discrete_state(initial_observations)

    while not done:
        # Take an Action within the range of Actions and correct size
        if np.random.random() > epsilon:
            action = np.argmax(q_table[discrete_state])
            action_take = (action / 25) - 1
        else:
            action = np.random.randint(0, 50)
            action_take = (action / 25) - 1

        # Perform the Action in the Environment
        time_step = env.step(action_take)
        observations = np.concatenate((time_step.observation['position'], time_step.observation['velocity']))

        # Get new Discrete Step
        new_discrete_state = get_discrete_state(observations)
        if time_step.discount is None:
            done = True
        if not done:
            max_future_q = np.max(q_table[new_discrete_state])
            current_q = q_table[discrete_state + (action,)]
            new_q = (1 - Learning_Rate) * current_q + Learning_Rate * (time_step.reward + Discount * max_future_q)
            q_table[discrete_state + (action,)] = new_q
            episode_reward += time_step.reward

        discrete_state = new_discrete_state

    if END_EPSILON_DECAYING >= episode >= START_EPSILON_DECAYING:
        epsilon -= epsilon_decay_value

    ep_rewards.append(episode_reward)
    if not episode % SHOW_EVERY:
        average_reward = sum(ep_rewards[-SHOW_EVERY:]) / len(ep_rewards[-SHOW_EVERY:])
        aggr_ep_rewards['ep'].append(episode)
        aggr_ep_rewards['avg'].append(average_reward)
        aggr_ep_rewards['min'].append(min(ep_rewards[-SHOW_EVERY:]))
        aggr_ep_rewards['max'].append(max(ep_rewards[-SHOW_EVERY:]))

# Reset Environment
initial_values = env.reset()
initial_observations = np.concatenate((initial_values.observation['position'], initial_values.observation['velocity']))
discrete_state = get_discrete_state(initial_observations)
done = False

# Define a uniform random policy.
def random_action_policy(time_step, done=False, discrete_state=get_discrete_state(initial_observations)):
    # Take an Action within the range of Actions and correct size
    action = np.argmax(q_table[discrete_state])

    # Perform the Action in the Environment
    time_step = env.step(action)
    observations = np.concatenate((time_step.observation['position'], time_step.observation['velocity']))

    # Get new Discrete Step
    new_discrete_state = get_discrete_state(observations)
    if time_step.discount is None:
        done = True
    if not done:
        max_future_q = np.max(q_table[new_discrete_state])
        current_q = q_table[discrete_state + (action,)]
        new_q = (1 - Learning_Rate) * current_q + Learning_Rate * (time_step.reward + Discount * max_future_q)
        q_table[discrete_state + (action,)] = new_q

    discrete_state = new_discrete_state

    # Print the Results of the Action
    print("reward = {}, discount = {}, observations = {}.".format(
        time_step.reward, time_step.discount, time_step.observation))

    return action


plt.plot(aggr_ep_rewards['ep'], aggr_ep_rewards['avg'], label="avg")
plt.plot(aggr_ep_rewards['ep'], aggr_ep_rewards['min'], label="min")
plt.plot(aggr_ep_rewards['ep'], aggr_ep_rewards['max'], label="max")
plt.legend(loc=4)
plt.show()

# Launch the viewer application.
viewer.launch(env, policy=random_action_policy)
from dm_control import suite
import numpy as np
from PIL import Image
import subprocess
import torch

seed = 0
env = suite.load(domain_name='cartpole', task_name="two_poles", task_kwargs={'random': seed})
action_spec = env.action_spec()

time_step_counter = 0
subprocess.call(['rm', '-rf', 'frames'])
subprocess.call(['mkdir', '-p', 'frames'])

s = env.reset()
env._physics.get_state()

# K_LQR = torch.tensor([[-0.095211883797698, 23.498594950851146, -0.506162305244223, 5.042039423490390]])  # this is K
# K_LQR = torch.tensor([[0.0880, -137.0451, 139.2033, 0.5070, -10.6144, 19.7211]])  # this is K
K_LQR = torch.tensor([[0.0670, -115.3641, 112.3498, 0.3516, -2.9878, 4.9511]])

R = 0
time_step = env.reset()
States = env._physics.get_state()
print(States)

while not time_step.last():
def __init__(
    self,
    domain_name,
    task_name,
    task_kwargs=None,
    visualize_reward={},
    from_pixels=False,
    height=84,
    width=84,
    camera_id=0,
    frame_skip=1,
    environment_kwargs=None,
    channels_first=True,
):
    assert "random" in task_kwargs, "please specify a seed, for deterministic behaviour"
    self._from_pixels = from_pixels
    self._height = height
    self._width = width
    self._camera_id = camera_id
    self._frame_skip = frame_skip
    self._channels_first = channels_first

    # create task
    if domain_name == "manipulation":
        self._env = manipulation.load(task_name, seed=task_kwargs.get("random", 1))
    else:
        self._env = suite.load(
            domain_name=domain_name,
            task_name=task_name,
            task_kwargs=task_kwargs,
            visualize_reward=visualize_reward,
            environment_kwargs=environment_kwargs,
        )

    # true and normalized action spaces
    self._true_action_space = _spec_to_box([self._env.action_spec()])
    self._norm_action_space = spaces.Box(
        low=-1.0,
        high=1.0,
        shape=self._true_action_space.shape,
        dtype=np.float32,
    )

    # create observation space
    if from_pixels:
        shape = [3, height, width] if channels_first else [height, width, 3]
        self._observation_space = spaces.Box(low=0, high=255, shape=shape, dtype=np.uint8)
    else:
        self._observation_space = _spec_to_box(self._env.observation_spec().values())

    self._state_space = _spec_to_box(self._env.observation_spec().values())

    self.current_state = None

    # set seed
    self.seed(seed=task_kwargs.get("random", 1))
def make_trajectory(domain, task, seed, **trajectory_kwargs):
    env = suite.load(domain, task, task_kwargs={'random': seed})
    policy = uniform_random_policy(env.action_spec(), random=seed)
    return step_environment(env, policy, **trajectory_kwargs)
import numpy as np
import torch
import cv2

from dm_control import suite

import lib_duju.utils as duju_utils

from Model.ReplayBuffer import ReplayBuffer
from Model.SAC_base import target_initialize
from Model.Discrete_SAC import DiscreteSAC
from Model.Discrete_SAC import train_discrete_SAC_max

exp_title = "SAC_DM_Discrete_internal"

env = suite.load(domain_name="cartpole", task_name="swingup")

state_dim = duju_utils.state_1d_dim_calc(env)[-1]
action_dim = 2
action_dict = {0: -1.0, 1: 1.0}
reward_compensate = 10

alpha = 1.0
lr = 1e-3
gamma = 0.99
device = torch.device("cuda")

max_episode = 10000
batch_size = 100
from dm_control import suite
from dm_control import viewer
import numpy as np

# Load one task:
env = suite.load(domain_name="quadruped", task_name="fetch", visualize_reward=True)
# viewer.launch(env)

# Iterate over a task set:
# for domain_name, task_name in suite.BENCHMARKING:
#     env = suite.load(domain_name, task_name)
#     print(domain_name, " ", task_name)
#     viewer.launch(env)

# Step through an episode and print out reward, discount and observation.
action_spec = env.action_spec()
time_step = env.reset()


def random_policy(time_step):
    del time_step  # Unused.
    return np.random.uniform(low=action_spec.minimum,
                             high=action_spec.maximum,
                             size=action_spec.shape)
    # return action_spec.minimum


# Launch the viewer application.
viewer.launch(env, policy=random_policy)
def load(domain_name,
         task_name,
         difficulty=None,
         dynamic=False,
         background_dataset_path=None,
         background_dataset_videos="train",
         background_kwargs=None,
         camera_kwargs=None,
         color_kwargs=None,
         task_kwargs=None,
         environment_kwargs=None,
         visualize_reward=False,
         render_kwargs=None,
         pixels_only=True,
         pixels_observation_key="pixels",
         env_state_wrappers=None):
    """Returns an environment from a domain name, task name and optional settings.

    ```python
    env = suite.load('cartpole', 'balance')
    ```

    Adding a difficulty will configure distractions matching the reference paper
    for easy, medium, hard. Users can also toggle dynamic properties for
    distractions.

    Args:
      domain_name: A string containing the name of a domain.
      task_name: A string containing the name of a task.
      difficulty: Difficulty for the suite. One of 'easy', 'medium', 'hard'.
      dynamic: Boolean controlling whether distractions are dynamic or static.
      background_dataset_path: String to the davis directory that contains the
        video directories.
      background_dataset_videos: String ('train'/'val') or list of strings of the
        DAVIS videos to be used for backgrounds.
      background_kwargs: Dict, overwrites settings for background distractions.
      camera_kwargs: Dict, overwrites settings for camera distractions.
      color_kwargs: Dict, overwrites settings for color distractions.
      task_kwargs: Dict, dm control task kwargs.
      environment_kwargs: Optional `dict` specifying keyword arguments for the
        environment.
      visualize_reward: Optional `bool`. If `True`, object colours in rendered
        frames are set to indicate the reward at each step. Default `False`.
      render_kwargs: Dict, render kwargs for pixel wrapper.
      pixels_only: Boolean controlling the exclusion of states in the observation.
      pixels_observation_key: Key in the observation used for the rendered image.
      env_state_wrappers: Env state wrappers to be called before the PixelWrapper.

    Returns:
      The requested environment.
    """
    if not is_available():
        raise ImportError("dm_control module is not available. Make sure you "
                          "follow the installation instructions from the "
                          "dm_control package.")

    if difficulty not in [None, "easy", "medium", "hard"]:
        raise ValueError("Difficulty should be one of: 'easy', 'medium', 'hard'.")

    render_kwargs = render_kwargs or {}
    if "camera_id" not in render_kwargs:
        render_kwargs["camera_id"] = 2 if domain_name == "quadruped" else 0

    env = suite.load(domain_name,
                     task_name,
                     task_kwargs=task_kwargs,
                     environment_kwargs=environment_kwargs,
                     visualize_reward=visualize_reward)

    # Apply background distractions.
    if difficulty or background_kwargs:
        background_dataset_path = (background_dataset_path
                                   or suite_utils.DEFAULT_BACKGROUND_PATH)
        final_background_kwargs = dict()
        if difficulty:
            # Get kwargs for the given difficulty.
            num_videos = suite_utils.DIFFICULTY_NUM_VIDEOS[difficulty]
            final_background_kwargs.update(
                suite_utils.get_background_kwargs(domain_name, num_videos, dynamic,
                                                  background_dataset_path,
                                                  background_dataset_videos))
        else:
            # Set the dataset path and the videos.
            final_background_kwargs.update(
                dict(dataset_path=background_dataset_path,
                     dataset_videos=background_dataset_videos))
        if background_kwargs:
            # Overwrite kwargs with those passed here.
            final_background_kwargs.update(background_kwargs)
        env = background.DistractingBackgroundEnv(env, **final_background_kwargs)

    # Apply camera distractions.
    if difficulty or camera_kwargs:
        final_camera_kwargs = dict(camera_id=render_kwargs["camera_id"])
        if difficulty:
            # Get kwargs for the given difficulty.
            scale = suite_utils.DIFFICULTY_SCALE[difficulty]
            final_camera_kwargs.update(
                suite_utils.get_camera_kwargs(domain_name, scale, dynamic))
        if camera_kwargs:
            # Overwrite kwargs with those passed here.
            final_camera_kwargs.update(camera_kwargs)
        env = camera.DistractingCameraEnv(env, **final_camera_kwargs)

    # Apply color distractions.
    if difficulty or color_kwargs:
        final_color_kwargs = dict()
        if difficulty:
            # Get kwargs for the given difficulty.
            scale = suite_utils.DIFFICULTY_SCALE[difficulty]
            final_color_kwargs.update(suite_utils.get_color_kwargs(scale, dynamic))
        if color_kwargs:
            # Overwrite kwargs with those passed here.
            final_color_kwargs.update(color_kwargs)
        env = color.DistractingColorEnv(env, **final_color_kwargs)

    if env_state_wrappers is not None:
        for wrapper in env_state_wrappers:
            env = wrapper(env)

    # Apply Pixel wrapper after distractions. This is needed to ensure the
    # changes from the distraction wrapper are applied to the MuJoCo environment
    # before the rendering occurs.
    env = pixels.Wrapper(env,
                         pixels_only=pixels_only,
                         render_kwargs=render_kwargs,
                         observation_key=pixels_observation_key)

    return env
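# A hypothetical usage sketch for the distracting-suite load() defined above.
# The domain/task names and the DAVIS dataset path below are illustrative
# assumptions, not values taken from the source:
#
#   env = load('cheetah', 'run',
#              difficulty='easy',          # one of 'easy', 'medium', 'hard'
#              dynamic=True,               # distractions change over time
#              background_dataset_path='/path/to/DAVIS/JPEGImages/480p')
#   time_step = env.reset()
#   frame = time_step.observation['pixels']  # added by the pixels.Wrapper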
def __init__(self,
             domain,
             task,
             *args,
             env=None,
             normalize=True,
             observation_keys=(),
             goal_keys=(),
             unwrap_time_limit=True,
             pixel_wrapper_kwargs=None,
             **kwargs):
    assert not args, (
        "Gym environments don't support args. Use kwargs instead.")

    self.normalize = normalize
    self.unwrap_time_limit = unwrap_time_limit

    super(DmControlAdapter, self).__init__(domain, task, *args, goal_keys=goal_keys, **kwargs)

    if env is None:
        assert (domain is not None and task is not None), (domain, task)
        env = suite.load(
            domain_name=domain,
            task_name=task,
            task_kwargs=kwargs
            # TODO(hartikainen): Figure out how to pass kwargs to this guy.
            # Need to split into `task_kwargs`, `environment_kwargs`, and
            # `visualize_reward` bool. Check the suite.load(.) in:
            # https://github.com/deepmind/dm_control/blob/master/dm_control/suite/__init__.py
        )
        self._env_kwargs = kwargs
    else:
        assert not kwargs
        assert domain is None and task is None, (domain, task)

    if normalize:
        if (np.any(env.action_spec().minimum != -1)
                or np.any(env.action_spec().maximum != 1)):
            env = action_scale.Wrapper(env, minimum=-1.0, maximum=1.0)
        np.testing.assert_equal(env.action_spec().minimum, -1)
        np.testing.assert_equal(env.action_spec().maximum, 1)

    if pixel_wrapper_kwargs is not None:
        env = pixels.Wrapper(env, **pixel_wrapper_kwargs)

    self._env = env

    assert isinstance(env.observation_spec(), OrderedDict)
    self.observation_keys = (observation_keys or tuple(env.observation_spec().keys()))

    observation_space = convert_dm_control_to_gym_space(env.observation_spec())

    self._observation_space = type(observation_space)([
        (name, copy.deepcopy(space))
        for name, space in observation_space.spaces.items()
        if name in self.observation_keys + self.goal_keys
    ])

    action_space = convert_dm_control_to_gym_space(self._env.action_spec())

    if len(action_space.shape) > 1:
        raise NotImplementedError(
            "Shape of the action space ({}) is not flat, make sure to"
            " check the implementation.".format(action_space))

    self._action_space = action_space
def __init__(self,
             level: LevelSelection,
             frame_skip: int,
             visualization_parameters: VisualizationParameters,
             seed: Union[None, int] = None,
             human_control: bool = False,
             observation_type: ObservationType = ObservationType.Measurements,
             custom_reward_threshold: Union[int, float] = None,
             **kwargs):
    super().__init__(level, seed, frame_skip, human_control, custom_reward_threshold,
                     visualization_parameters)

    self.observation_type = observation_type

    # load and initialize environment
    domain_name, task_name = self.env_id.split(":")
    self.env = suite.load(domain_name=domain_name, task_name=task_name,
                          task_kwargs={'random': seed})

    if observation_type != ObservationType.Measurements:
        self.env = pixels.Wrapper(self.env,
                                  pixels_only=observation_type == ObservationType.Image)

    # seed
    if self.seed is not None:
        np.random.seed(self.seed)
        random.seed(self.seed)

    self.state_space = StateSpace({})

    # image observations
    if observation_type != ObservationType.Measurements:
        self.state_space['pixels'] = ImageObservationSpace(
            shape=self.env.observation_spec()['pixels'].shape, high=255)

    # measurements observations
    if observation_type != ObservationType.Image:
        measurements_space_size = 0
        measurements_names = []
        for observation_space_name, observation_space in self.env.observation_spec().items():
            if len(observation_space.shape) == 0:
                measurements_space_size += 1
                measurements_names.append(observation_space_name)
            elif len(observation_space.shape) == 1:
                measurements_space_size += observation_space.shape[0]
                measurements_names.extend([
                    "{}_{}".format(observation_space_name, i)
                    for i in range(observation_space.shape[0])
                ])
        self.state_space['measurements'] = VectorObservationSpace(
            shape=measurements_space_size, measurements_names=measurements_names)

    # actions
    self.action_space = BoxActionSpace(
        shape=self.env.action_spec().shape[0],
        low=self.env.action_spec().minimum,
        high=self.env.action_spec().maximum)

    # initialize the state by getting a new state from the environment
    self.reset_internal_state(True)

    # render
    if self.is_rendered:
        image = self.get_rendered_image()
        scale = 1
        if self.human_control:
            scale = 2
        if not self.native_rendering:
            self.renderer.create_screen(image.shape[1] * scale, image.shape[0] * scale)
import numpy as np
from dm_control import suite
from PIL import Image
import cv2
import os
import glob

env = suite.load(domain_name="humanoid", task_name='run')
action_spec = env.action_spec()
time_step = env.reset()
time_step_counter = 0

while not time_step.last() and time_step_counter < 500:
    action = np.random.uniform(action_spec.minimum, action_spec.maximum,
                               size=action_spec.shape)
    time_step = env.step(action)

    image_data = env.physics.render(height=480, width=480, camera_id="back")
    # img = Image.fromarray(image_data, 'RGB')
    # image = np.array(img)
    cv2.imwrite('frames/humanoid-%.3d.jpg' % time_step_counter, image_data)

    time_step_counter += 1
    print(time_step.reward, time_step.discount, time_step.observation)

img_array = []
for filename in glob.glob('frames/*.jpg'):
def __init__(self, domain_name, task_name, horizon=None, gamma=0.99, task_kwargs=None,
             dt=.01, width_screen=480, height_screen=480, camera_id=0,
             use_pixels=False, pixels_width=64, pixels_height=64):
    """
    Constructor.

    Args:
        domain_name (str): name of the environment;
        task_name (str): name of the task of the environment;
        horizon (int): the horizon;
        gamma (float): the discount factor;
        task_kwargs (dict, None): parameters of the task;
        dt (float, .01): duration of a control step;
        width_screen (int, 480): width of the screen;
        height_screen (int, 480): height of the screen;
        camera_id (int, 0): position of camera to render the environment;
        use_pixels (bool, False): if True, pixel observations are used rather
            than the state vector;
        pixels_width (int, 64): width of the pixel observation;
        pixels_height (int, 64): height of the pixel observation;

    """
    # MDP creation
    self.env = suite.load(domain_name, task_name, task_kwargs=task_kwargs)

    if use_pixels:
        self.env = pixels.Wrapper(self.env,
                                  render_kwargs={'width': pixels_width,
                                                 'height': pixels_height})

    # get the default horizon
    if horizon is None:
        horizon = self.env._step_limit

    # Hack to ignore dm_control time limit.
    self.env._step_limit = np.inf

    if use_pixels:
        self._convert_observation_space = self._convert_observation_space_pixels
        self._convert_observation = self._convert_observation_pixels
    else:
        self._convert_observation_space = self._convert_observation_space_vector
        self._convert_observation = self._convert_observation_vector

    # MDP properties
    action_space = self._convert_action_space(self.env.action_spec())
    observation_space = self._convert_observation_space(self.env.observation_spec())
    mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

    self._viewer = ImageViewer((width_screen, height_screen), dt)
    self._camera_id = camera_id

    super().__init__(mdp_info)

    self._state = None
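# A hypothetical construction sketch for the environment class whose constructor
# is shown above. The class name `DMControl` is an assumption; only the argument
# names and defaults come from the docstring:
#
#   mdp = DMControl('walker', 'walk', horizon=1000, gamma=0.99,
#                   use_pixels=True, pixels_width=64, pixels_height=64)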
def cartpole_environment(seed: int = 42):
    env = suite.load("cartpole", "swingup", {"random": seed})
    return env
args.file_name = f"{args.policy}_{args.domain_name}_{args.batch_size}_{args.seed}" print("---------------------------------------") print(f"Policy: {args.policy}, Env: {args.domain_name}, Seed: {args.seed}") print("---------------------------------------") if not os.path.exists("./results"): os.makedirs("./results") if args.save_model and not os.path.exists("./models"): os.makedirs("./models") if not os.path.exists("./graphs"): os.makedirs("./graphs") env = suite.load(args.domain_name, args.task_name, {"random": args.seed}) # Set seeds np.random.seed(args.seed) temp_timestep = env.reset() state_dim = flat_obs(temp_timestep.observation).shape[0] action_dim = env.action_spec().shape[0] max_action = float(env.action_spec().maximum[0]) kwargs = { "state_dim": state_dim, "action_dim": action_dim, "max_action": max_action, "discount": args.discount, }
from dm_control import suite
from dm_control import viewer
import numpy as np

env = suite.load(domain_name="finger", task_name="turn_easy")
action_spec = env.action_spec()
time_step = env.reset()


# The viewer calls the policy once per step and expects an action in return,
# so sample a fresh random action on each call instead of stepping env here.
def random_policy(time_step):
    print(time_step.reward, time_step.discount, time_step.observation)
    return np.random.uniform(action_spec.minimum, action_spec.maximum,
                             size=action_spec.shape)


viewer.launch(env, policy=random_policy)
from dm_control import suite
from dm_control import viewer
import numpy as np

env = suite.load(domain_name="hopper", task_name="stand")
action_spec = env.action_spec()


# Define a uniform random policy.
def random_policy(time_step):
    del time_step  # Unused.
    return np.random.uniform(low=action_spec.minimum,
                             high=action_spec.maximum,
                             size=action_spec.shape)


# Launch the viewer application.
viewer.launch(env, policy=random_policy)
from dm_control import suite
from dm_control import viewer

# Load an environment from the Control Suite.
env = suite.load(domain_name="humanoid", task_name="stand")

# Launch the viewer application.
viewer.launch(env)
def make_environment(domain_name: str = 'cartpole',
                     task_name: str = 'balance') -> dm_env.Environment:
    """Creates a control suite environment."""
    environment = suite.load(domain_name, task_name)
    environment = wrappers.SinglePrecisionWrapper(environment)
    return environment
import gym
import random
import collections
import numpy as np
import torch
from PIL import Image
import subprocess
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
import dmc2gym
from dm_control import suite

seed = 0
env = suite.load(domain_name='cartpole', task_name="balance", task_kwargs={'random': seed})

lr_mu = 0.0005
path = "/zhome/38/5/117684/Desktop/Cartpole/network_test/actor_net200"


def _flatten_obs(obs):
    obs_pieces = []
    for v in obs.values():
        flat = np.array([v]) if np.isscalar(v) else v.ravel()
        obs_pieces.append(flat)
    return np.concatenate(obs_pieces, axis=0)


class MuNet(nn.Module):
    def __init__(self, n, o, learning_rate):
        super(MuNet, self).__init__()
        # network
        self.fc1 = nn.Linear(5, 400)
def _reward(self, state, action):
    self.env.reset()
    state = np.array(state, dtype='float32')
    with self.env.physics.reset_context():
        self.env.physics.set_state(state)
    timestep = self.env.step(action)
    return timestep.reward

def _batch_reward(self, state, action):
    return [self._reward(s, a) for s, a in zip(state, action)]

def reward(self, state, action, repeats=1):
    if state.ndim > 1:
        return self._batch_reward(state, action)
    else:
        return self._reward(state, action)


if __name__ == '__main__':
    env = suite.load('cheetah', 'run')
    states = np.random.random((32, 18))
    actions = np.random.random((32, 6))
    oracle = RewardOracle(env)

    from time import time
    times = []
    for _ in range(5):
        start = time()
        rewards = oracle.reward(states, actions)
        print(rewards[0:5])
        times.append(time() - start)
    print(np.mean(times), np.std(times))
    if kwargs.get('mode', 'rgb_array') != 'rgb_array':
        raise ValueError("Only render mode 'rgb_array' is supported.")
    del args  # Unused
    del kwargs  # Unused
    return self._env.physics.render(*self._render_size, camera_id=self._camera_id)


# envgym = gym.make("Breakout-v4")
# envgym = DeepMindWrapper_gym(envgym)
# envgym.reset()
# for t in range(1000):
#     img = envgym.render()
#     s, _, _, _ = envgym.step(envgym.action_space.sample())

from dm_control import suite
# from planet.control.wrappers import DeepMindWrapper

# env = suite.load('cheetah', 'run')
env = suite.load('walker', 'walk')
env = DeepMindWrapper(env)
env.reset()
for t in range(1000):
    img = env.render()
    s, _, _, _ = env.step(env.action_space.sample())

print(env.action_space)
print(env.observation_space)

# env = DeepMindWrapper(env)