Example #1
from dm_control import suite
from dm_control.glviz import viz
import numpy as np

# Load one task:
env = suite.load(domain_name="humanoid", task_name="walk")
# env = suite.load( domain_name = "cartpole", task_name = "balance" )
# env = suite.load( domain_name = "acrobot", task_name = "swingup" )
# env = suite.load( domain_name = "ball_in_cup", task_name = "catch" )
# env = suite.load( domain_name = "cheetah", task_name = "run" )
# env = suite.load( domain_name = "finger", task_name = "spin" )
# env = suite.load( domain_name = "fish", task_name = "swim" )# needs tweaking : ellipsoid support
# env = suite.load( domain_name = "hopper", task_name = "stand" )
# env = suite.load( domain_name = "manipulator", task_name = "bring_ball" )# need tweaking : cylinder support and different lighting position
# env = suite.load( domain_name = "pendulum", task_name = "swingup" )
# env = suite.load( domain_name = "point_mass", task_name = "easy" )
# env = suite.load( domain_name = "reacher", task_name = "easy" )
# env = suite.load( domain_name = "swimmer", task_name = "swimmer6" )
# env = suite.load( domain_name = "primitives", task_name = "test" )

visualizer = viz.Visualizer(env.physics)

# Step through an episode and print out reward, discount and observation.
action_spec = env.action_spec()
time_step = env.reset()

_paused = False

while not time_step.last():
    action = np.random.uniform(action_spec.minimum,
                               action_spec.maximum,
                               size=action_spec.shape)
    time_step = env.step(action)
    print(time_step.reward, time_step.discount, time_step.observation)
Example #2
        t, r, _, s2 = timestep
        s2 = torch.FloatTensor(utils.state_1d_flat(s2)).to(device)

        s = s2
        ep_reward += r
        prev_action = a

    if video_info is not None:
        video_saver.release()

    return ep_reward


if __name__ == "__main__":

    env = suite.load(domain_name=domain_name, task_name=task_name)

    state_dim = utils.state_1d_dim_calc(env)[-1]
    action_dim = env.action_spec().shape[-1]

    utils.append_file_writer(record_dir, "exp_detail.txt",
                             "state_dim : " + str(state_dim) + "\n")
    utils.append_file_writer(record_dir, "exp_detail.txt",
                             "action_dim : " + str(action_dim) + "\n")

    replay_buffer = ReplayBuffer.ReplayBuffer(buffer_size=buffer_size)

    MSEcriterion = nn.MSELoss()

    actor_main = DDPGActor(state_dim, action_dim, actor_lr, device)
    actor_target = DDPGActor(state_dim, action_dim, actor_lr, device)
Example #3
 def __init__(self, domain, task, task_kwargs=None, visualize_reward=False):
     self._dmenv = suite.load(domain, task, task_kwargs, visualize_reward)
     self._viewer = None
Example #4
 def loader():
   env = suite.load(
       domain_name=domain_name, task_name=task_name, task_kwargs=task_kwargs)
   env.task.visualize_reward = FLAGS.visualize_reward
   return env
Example #5
from dm_control import suite
from dm_control import viewer
import numpy as np
#import matplotlib.pyplot as plt

# Set up Environment
env = suite.load(domain_name="cartpole", task_name="balance_sparse")
initial_values = env.reset()


# Get Possible Actions for Environment 
action_spec = env.action_spec()

# Initialize Q Table
initial_observations = np.concatenate((initial_values.observation['position'],initial_values.observation['velocity']))
DISCRETE_OS_SIZE = np.array([50] * len(initial_observations))
guess_high_observation = 2
guess_low_observation = -2
discrete_os_win_size = np.array(([guess_high_observation - guess_low_observation] * 5)) / DISCRETE_OS_SIZE
action_space = np.array([3])

# Parameters
Learning_Rate = 0.05
Discount = 0.95
Episodes = 10000

SHOW_EVERY = 50

epsilon = 0.5
START_EPSILON_DECAYING = 1
END_EPSILON_DECAYING = Episodes // 2 # // Ensures no float
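
The snippet stops after the epsilon schedule; a plausible continuation (mirroring the fuller tabular Q-learning script that appears later in Example #11) would compute the decay step and allocate the Q-table:

# Continuation sketch, based on the later tabular script in this collection.
epsilon_decay_value = epsilon / (END_EPSILON_DECAYING - START_EPSILON_DECAYING)
q_table = np.random.uniform(low=-1, high=1,
                            size=np.concatenate((DISCRETE_OS_SIZE, action_space)))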
Example #6
 def from_suite(cls, domain_name, task_name):
     return cls(suite.load(domain_name, task_name),
                name='{}.{}'.format(domain_name, task_name))
def make_environment(evaluation: bool = False):
    del evaluation  # Unused.
    environment = suite.load('cartpole', 'balance')
    wrapped = wrappers.SinglePrecisionWrapper(environment)
    return wrapped
Example #8
    print("---------------------------------------")
    # info for particular task
    task_kwargs = {}
    if args.domain == 'jaco':
        if args.fence_name == 'jodesk':
            # .1f is too low - joint 4 hit sometimes!!!!
            task_kwargs['fence'] = {'x':(-.5,.5), 'y':(-1.0, .4), 'z':(.15, 1.2)}
        else:
            task_kwargs['fence'] = {'x':(-5,5), 'y':(-5, 5), 'z':(.15, 1.2)}
        if args.use_robot:
            task_kwargs['physics_type'] = 'robot'
            args.eval_filename_modifier += 'robot'
        else:
            task_kwargs['physics_type'] = 'mujoco'

    _env = suite.load(domain_name=args.domain, task_name=args.task, task_kwargs=task_kwargs,  environment_kwargs=environment_kwargs)
    kwargs = get_kwargs(_env)
    del _env

    # if we need to make a movie, must have frames
    if np.max([args.plot_movie, args.plot_action_movie, args.plot_frames]):
        args.state_pixels = True
    if not args.state_pixels:
        cam_dim = [0,0,0]
    else:
        if args.convert_to_gray:
            cam_dim = [args.frame_height, args.frame_width, 1]
        else:
            cam_dim = [args.frame_height, args.frame_width, 3]
    # Set seeds
    torch.manual_seed(args.seed)
Example #9
def evaluate(load_model_filepath):
    print("starting evaluation for {} episodes".format(args.num_eval_episodes))
    policy, train_step, results_dir, loaded_modelpath = load_policy(load_model_filepath)
    eval_seed = args.seed+train_step
    task_kwargs['random'] = eval_seed
    load_model_base = loaded_modelpath.replace('.pt', '')
    plotting.plot_loss_dict(policy, load_model_base)
    state_names_dict = get_state_names_dict()
    train_replay_buffer = load_replay_buffer(load_model_base + '.pkl')

    eval_env = suite.load(domain_name=args.domain, task_name=args.task, task_kwargs=task_kwargs,  environment_kwargs=environment_kwargs)

    # generate random seed
    random_state = np.random.RandomState(eval_seed)
    train_dir = os.path.join(load_model_base + '_train%s'%args.eval_filename_modifier)
    if not os.path.exists(train_dir):
        os.makedirs(train_dir)
    train_base = os.path.join(train_dir, get_step_filename(train_step)+'_train')
    plotting.plot_replay_reward(train_replay_buffer, train_base, start_step=train_step, name_modifier='train')
    plotting.plot_states(train_replay_buffer.get_last_steps(train_replay_buffer.size), 
                train_base, detail_dict=state_names_dict)
 

    eval_dir = os.path.join(load_model_base + '_eval%s'%args.eval_filename_modifier)
    if not os.path.exists(eval_dir):
        os.makedirs(eval_dir)
    print('saving results to dir: {}'.format(eval_dir))
    eval_base = os.path.join(eval_dir, get_step_filename(train_step)+'_eval_S{:05d}'.format(eval_seed))

    eval_step_filepath = eval_base + '%s.epkl'%args.eval_filename_modifier
    if os.path.exists(eval_step_filepath) and not args.overwrite_replay:
        print('loading existing replay buffer:{}'.format(eval_step_filepath))
        eval_replay_buffer = load_replay_buffer(eval_step_filepath)
    else:

        eval_replay_buffer = ReplayBuffer(kwargs['state_dim'], kwargs['action_dim'], 
                                     max_size=int(args.eval_replay_size), 
                                     cam_dim=cam_dim, seed=eval_seed)
 

        for e in range(args.num_eval_episodes):
            done = False
            num_steps = 0
            state_type, reward, discount, state = eval_env.reset()
            frame_compressed = get_next_frame(eval_env)
            # TODO off by one error in step count!? of replay_buffer
            while not done:
                action = (
                        policy.select_action(state['observations'])
                    ).clip(-kwargs['max_action'], kwargs['max_action'])
                # Perform action
                step_type, reward, discount, next_state = eval_env.step(action)
                next_frame_compressed = get_next_frame(eval_env)
                done = step_type.last()
                # Store data in replay buffer
                eval_replay_buffer.add(state['observations'], action, reward, 
                                  next_state['observations'], done, 
                                  frame_compressed=frame_compressed, 
                                  next_frame_compressed=next_frame_compressed)

                frame_compressed = next_frame_compressed
                state = next_state
                num_steps+=1
                time.sleep(.1)
 
            # plot episode
            er = int(eval_replay_buffer.episode_rewards[-1])
            epath = eval_base+ '_E{}_R{}'.format(e, er)
            exp = eval_replay_buffer.get_last_steps(num_steps)
            plotting.plot_states(exp, epath, detail_dict=state_names_dict)
            if args.domain == 'jaco':
                plotting.plot_position_actions(exp, epath, relative=True)
            if np.max([args.plot_movie, args.plot_action_movie, args.plot_frames]):
                emovie_path = epath + 'CAM{}.mp4'.format(args.camera_view)
                print('plotting episode: {}'.format(emovie_path))
                plotting.plot_frames(emovie_path, 
                                     eval_replay_buffer.get_last_steps(num_steps),
                                      plot_action_frames=args.plot_action_movie,
                                       min_action=-kwargs['max_action'], max_action=kwargs['max_action'], 
                                     plot_frames=args.plot_frames)

    eval_replay_buffer.shrink_to_last_step()
    pickle.dump(eval_replay_buffer, open(eval_step_filepath, 'wb'))
    # plot evaluation
    plotting.plot_replay_reward(eval_replay_buffer, eval_base, start_step=train_step, name_modifier='eval')
    plotting.plot_states(eval_replay_buffer.get_last_steps(eval_replay_buffer.size), 
                eval_base, detail_dict=state_names_dict)

    if np.max([args.plot_movie, args.plot_action_movie, args.plot_frames]):
        movie_path = eval_base+'_CAM{}.mp4'.format(args.camera_view)
        plotting.plot_frames(movie_path, eval_replay_buffer.get_last_steps(eval_replay_buffer.size), plot_action_frames=args.plot_action_movie, min_action=-kwargs['max_action'], max_action=kwargs['max_action'], plot_frames=args.plot_frames)
    return eval_replay_buffer, eval_step_filepath
Example #10
                    help="Shuffle REINFORCE samples from episode")
parser.add_argument("--record",
                    default=False,
                    action="store_true",
                    help="Make movies of agent")

args = parser.parse_args()

# create logs
if args.experiment_name is None:
    args.experiment_name = datetime.now().strftime('%b%d_%H-%M-%S')
writer = SummaryWriter(
    log_dir=os.path.join('project/logs', args.experiment_name))
logger = Logger('project/logs', args.experiment_name)
# create env
env = suite.load(*args.environment.split('-'))

# Create model
agent = Agent(env=env,
              H=args.H,
              K=args.K,
              traj_length=args.traj_length,
              softmax=args.softmax,
              predict_rewards=args.predict_reward,
              writer=writer,
              reinforce=args.reinforce,
              lr=args.lr,
              temperature=args.temperature,
              reinforce_lr=args.reinforce_lr,
              hidden_units=args.hidden_units,
              batch_size=args.reinforce_batchsize,
Example #11
    def create_model(self):
        model = Sequential()
        model.add(Conv2D(256, (3, 3), input_shape=OBSERVATION_SPACE_VALUES))
        model.add(Activation("relu"))
        model.add(MaxPooling2D(2, 2))
        model.add(Dropout(0.2))

        model.add(Conv2D(256, (3, 3)))
        model.add(Activation("relu"))
        model.add(MaxPooling2D(2, 2))
        model.add(Dropout(0.2))

        model.add(Flatten())
        model.add(Dense(64))

        model.add(Dense(ACTION_SPACE_SIZE, activation="linear"))
        model.compile(loss="mse", optimizer=Adam(lr=0.001), metrics=['accuracy'])
        return model
        
    def update_replay_memory(self, transition):
        self.replay_memory.append(transition)

    def get_qs(self, state):
        # Scale pixels to [0, 1] and return the Q values predicted for a single state.
        return self.model.predict(np.array(state).reshape(-1, *state.shape) / 255)[0]

    def train(self, terminal_state, step):
        if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
            return

        minibatch = random.sample(self.replay_memory, MINIBATCH_SIZE)

        current_states = np.array([transition[0] for transition in minibatch]) / 255
        current_qs_list = self.model.predict(current_states)

        new_current_states = np.array([transition[3] for transition in minibatch]) / 255
        future_qs_list = self.target_model.predict(new_current_states)

        X = []
        y = []

        for index, (current_state, action, reward, new_current_state, done) in enumerate(minibatch):
            if not done:
                max_future_q = np.max(future_qs_list[index])
                new_q = reward + DISCOUNT * max_future_q
            else:
                new_q = reward

            current_qs = current_qs_list[index]
            current_qs[action] = new_q

            X.append(current_state)
            y.append(current_qs)

        self.model.fit(np.array(X) / 255, np.array(y), batch_size=MINIBATCH_SIZE, verbose=0,
                       shuffle=False, callbacks=[self.tensorboard] if terminal_state else None)

        if terminal_state:
            self.target_update_counter += 1

        if self.target_update_counter > UPDATE_TARGET_EVERY:
            self.target_model.set_weights(self.model.get_weights())
            self.target_update_counter = 0

# Start DQN
agent = DQNAgent()

# Set up Environment
env = suite.load(domain_name="cartpole", task_name="balance_sparse")
initial_values = env.reset()

# Recording Performance
ep_rewards = []
aggr_ep_rewards = {'ep': [], 'avg': [], 'min': [], 'max': []}

for episode in tqdm(range(1, EPISODES+1), ascii=True, unit="episode"):
    agent.tensorboard.step = episode

    episode_reward = 0
    step = 1
    done = False
    time_step = env.reset()
    current_state = np.concatenate((time_step.observation['position'], time_step.observation['velocity']))

    while not done:
    
        # Decide if taking a random action w/ epsilon
        if np.random.random() > epsilon:
            action = np.argmax(agent.get_qs(current_state))
        else:
            action = np.random.randint(0, ACTION_SPACE_SIZE)
        
        # Perform the Action in the Environment
        time_step = env.step(action)
        reward = time_step.reward
        new_state = np.concatenate((time_step.observation['position'],time_step.observation['velocity']))
      
        if time_step.discount is None:
            done = True
        
        if not done:
            episode_reward += time_step.reward
            
        agent.update_replay_memory((current_state, action, reward, new_state, done))
        agent.train(done, step)
        
        current_state = new_state
        step += 1
        
    if END_EPSILON_DECAYING >= episode >= START_EPSILON_DECAYING:
        epsilon -= epsilon_decay_value
        
    ep_rewards.append(episode_reward)
    
    if not episode % SHOW_EVERY:
        average_reward = sum(ep_rewards[-SHOW_EVERY:])/len(ep_rewards[-SHOW_EVERY:])
        aggr_ep_rewards['ep'].append(episode)
        aggr_ep_rewards['avg'].append(average_reward)
        aggr_ep_rewards['min'].append(min(ep_rewards[-SHOW_EVERY:]))
        aggr_ep_rewards['max'].append(max(ep_rewards[-SHOW_EVERY:]))


# Set up Environment
env = suite.load(domain_name="cartpole", task_name="balance_sparse")
initial_values = env.reset()


# Get Possible Actions for Environment 
action_spec = env.action_spec()

# Initialize Q Table
initial_observations = np.concatenate((initial_values.observation['position'],initial_values.observation['velocity']))
DISCRETE_OS_SIZE = np.array([30] * len(initial_observations))
guess_high_observation = 1.5
guess_low_observation = -1.5
discrete_os_win_size = np.array(([guess_high_observation - guess_low_observation] * 5)) / DISCRETE_OS_SIZE
action_space = np.array([50])

# Parameters
Learning_Rate = 0.1
Discount = 0.99
Episodes = 10000

SHOW_EVERY = 50

epsilon = 0.5
START_EPSILON_DECAYING = 1
END_EPSILON_DECAYING = int(Episodes // 1.5)  # int() ensures no float (// with 1.5 still returns a float)

epsilon_decay_value = epsilon/(END_EPSILON_DECAYING - START_EPSILON_DECAYING)

q_table = np.random.uniform(low=-1,high=1,size=(np.concatenate((DISCRETE_OS_SIZE, action_space))))

# Recording Performance
ep_rewards = []
aggr_ep_rewards = {'ep': [], 'avg': [], 'min': [], 'max': []}

# Discretize State
def get_discrete_state(state):
    # Map the continuous observation onto bin indices: subtract the assumed lower
    # bound and divide by the bin width.
    discrete_state = (state - guess_low_observation) / discrete_os_win_size
    return tuple(discrete_state.astype(int))
    
discrete_state = get_discrete_state(initial_observations)
#print(q_table[discrete_state])

# Go through Episodes for Training
for episode in range(Episodes):
    done = False
    episode_reward = 0.0
    if episode % SHOW_EVERY == 0:
        print(episode)
    
    # Reset Environment
    initial_values = env.reset()
    initial_observations = np.concatenate((initial_values.observation['position'],initial_values.observation['velocity']))
    discrete_state = get_discrete_state(initial_observations)
    
    while not done:
      # Take a Action within the range of Actions and correct size
      if np.random.random() > epsilon:
        action = np.argmax(q_table[discrete_state])
        action_take = (action/25)-1
      else:
        action = np.random.randint(0,50)
        action_take = (action/25)-1
                               
      # Perform the Action in the Environment
      time_step = env.step(action_take)
      observations = np.concatenate((time_step.observation['position'],time_step.observation['velocity']))
      
      # Get new Discrete Step
      new_discrete_state = get_discrete_state(observations)
      
      if time_step.discount is None:
        done = True
      
      if not done:
        max_future_q = np.max(q_table[new_discrete_state])
        current_q = q_table[discrete_state + (action, )]
        new_q = (1-Learning_Rate) * current_q + Learning_Rate * (time_step.reward + Discount * max_future_q)
        q_table[discrete_state + (action, )] = new_q
        episode_reward += time_step.reward
        
      discrete_state = new_discrete_state
      
    if END_EPSILON_DECAYING >= episode >= START_EPSILON_DECAYING:
        epsilon -= epsilon_decay_value
        
    ep_rewards.append(episode_reward)
    
    if not episode % SHOW_EVERY:
        average_reward = sum(ep_rewards[-SHOW_EVERY:])/len(ep_rewards[-SHOW_EVERY:])
        aggr_ep_rewards['ep'].append(episode)
        aggr_ep_rewards['avg'].append(average_reward)
        aggr_ep_rewards['min'].append(min(ep_rewards[-SHOW_EVERY:]))
        aggr_ep_rewards['max'].append(max(ep_rewards[-SHOW_EVERY:]))
        
    
# Reset Environment
initial_values = env.reset()
initial_observations = np.concatenate((initial_values.observation['position'],initial_values.observation['velocity']))
discrete_state = get_discrete_state(initial_observations)
done = False


# Define a policy that acts greedily with respect to the learned Q-table (and keeps updating it).
def random_action_policy(time_step, done = False, discrete_state = get_discrete_state(initial_observations)):

  # Take a Action within the range of Actions and correct size
  action = np.argmax(q_table[discrete_state])
                           
  # Perform the Action in the Environment
  time_step = env.step(action)
  observations = np.concatenate((time_step.observation['position'],time_step.observation['velocity']))
  
  # Get new Discrete Step
  new_discrete_state = get_discrete_state(observations)
  
  if time_step.discount is None:
    done = True
  
  if not done:
    max_future_q = np.max(q_table[new_discrete_state])
    current_q = q_table[discrete_state + (action, )]
    new_q = (1-Learning_Rate) * current_q + Learning_Rate * (time_step.reward + Discount * max_future_q)
    q_table[discrete_state + (action, )] = new_q
    
  discrete_state = new_discrete_state
  
  # Print the Results of the Action
  print("reward = {}, discount = {}, observations = {}.".format(
    time_step.reward, time_step.discount, time_step.observation)) 
  return action   

plt.plot(aggr_ep_rewards['ep'], aggr_ep_rewards['avg'], label="avg")
plt.plot(aggr_ep_rewards['ep'], aggr_ep_rewards['min'], label="min")
plt.plot(aggr_ep_rewards['ep'], aggr_ep_rewards['max'], label="max")
plt.legend(loc=4)
plt.show()

# Launch the viewer application.
viewer.launch(env, policy=random_action_policy)
Example #12
from dm_control import suite
import numpy as np
from PIL import Image
import subprocess
import torch
seed = 0
env = suite.load(domain_name='cartpole',
                 task_name="two_poles",
                 task_kwargs={'random': seed})

action_spec = env.action_spec()

time_step_counter = 0

subprocess.call(['rm', '-rf', 'frames'])
subprocess.call(['mkdir', '-p', 'frames'])
s = env.reset()

env._physics.get_state()
# K_LQR = torch.tensor([[ -0.095211883797698 , 23.498594950851146  ,-0.506162305244223 ,  5.042039423490390]]) # this is K
# K_LQR = torch.tensor([[0.0880, -137.0451, 139.2033, 0.5070, -10.6144, 19.7211]]) # this is K
K_LQR = torch.tensor([[0.0670, -115.3641, 112.3498, 0.3516, -2.9878, 4.9511]])

R = 0

time_step = env.reset()

States = env._physics.get_state()
print(States)

while not time_step.last():
Example #13
    def __init__(
        self,
        domain_name,
        task_name,
        task_kwargs=None,
        visualize_reward={},
        from_pixels=False,
        height=84,
        width=84,
        camera_id=0,
        frame_skip=1,
        environment_kwargs=None,
        channels_first=True,
    ):
        assert ("random" in task_kwargs
                ), "please specify a seed, for deterministic behaviour"
        self._from_pixels = from_pixels
        self._height = height
        self._width = width
        self._camera_id = camera_id
        self._frame_skip = frame_skip
        self._channels_first = channels_first

        # create task
        if domain_name == "manipulation":
            self._env = manipulation.load(task_name,
                                          seed=task_kwargs.get("random", 1))
        else:
            self._env = suite.load(
                domain_name=domain_name,
                task_name=task_name,
                task_kwargs=task_kwargs,
                visualize_reward=visualize_reward,
                environment_kwargs=environment_kwargs,
            )

        # true and normalized action spaces
        self._true_action_space = _spec_to_box([self._env.action_spec()])
        self._norm_action_space = spaces.Box(
            low=-1.0,
            high=1.0,
            shape=self._true_action_space.shape,
            dtype=np.float32,
        )

        # create observation space
        if from_pixels:
            shape = ([3, height, width]
                     if channels_first else [height, width, 3])
            self._observation_space = spaces.Box(low=0,
                                                 high=255,
                                                 shape=shape,
                                                 dtype=np.uint8)
        else:
            self._observation_space = _spec_to_box(
                self._env.observation_spec().values())

        self._state_space = _spec_to_box(self._env.observation_spec().values())

        self.current_state = None

        # set seed
        self.seed(seed=task_kwargs.get("random", 1))
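
The constructor above is the familiar dmc2gym-style wrapper. A hypothetical instantiation is sketched below; the class name DMCWrapper is an assumption, since the listing does not show it:

# Sketch only: the class name is assumed, not taken from the listing.
env = DMCWrapper(domain_name="cheetah",
                 task_name="run",
                 task_kwargs={"random": 1},
                 from_pixels=True,
                 height=84,
                 width=84,
                 frame_skip=4)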
Example #14
def make_trajectory(domain, task, seed, **trajectory_kwargs):
    env = suite.load(domain, task, task_kwargs={'random': seed})
    policy = uniform_random_policy(env.action_spec(), random=seed)
    return step_environment(env, policy, **trajectory_kwargs)
import numpy as np
import torch
import cv2
from dm_control import suite

import lib_duju.utils as duju_utils

from Model.ReplayBuffer import ReplayBuffer
from Model.SAC_base import target_initialize

from Model.Discrete_SAC import DiscreteSAC
from Model.Discrete_SAC import train_discrete_SAC_max

exp_title = "SAC_DM_Discrete_internal"

env = suite.load(domain_name="cartpole", task_name="swingup")

state_dim = duju_utils.state_1d_dim_calc(env)[-1]
action_dim = 2

action_dict = {0: -1.0, 1: 1.0}

reward_compensate = 10
alpha = 1.0

lr = 1e-3
gamma = 0.99
device = torch.device("cuda")
max_episode = 10000
batch_size = 100
from dm_control import suite
from dm_control import viewer
import numpy as np

# Load one task:
env = suite.load(domain_name="quadruped",
                 task_name="fetch",
                 visualize_reward=True)

#viewer.launch(env)
# Iterate over a task set:
#for domain_name, task_name in suite.BENCHMARKING:
#  env = suite.load(domain_name, task_name)
#  print(domain_name," ",task_name)
#viewer.launch(env)
# Step through an episode and print out reward, discount and observation.
action_spec = env.action_spec()
time_step = env.reset()


def random_policy(time_step):
    del time_step  # Unused.
    return np.random.uniform(low=action_spec.minimum,
                             high=action_spec.maximum,
                             size=action_spec.shape)


#  return action_spec.minimum

# Launch the viewer application.
viewer.launch(env, policy=random_policy)
Example #17
def load(domain_name,
         task_name,
         difficulty=None,
         dynamic=False,
         background_dataset_path=None,
         background_dataset_videos="train",
         background_kwargs=None,
         camera_kwargs=None,
         color_kwargs=None,
         task_kwargs=None,
         environment_kwargs=None,
         visualize_reward=False,
         render_kwargs=None,
         pixels_only=True,
         pixels_observation_key="pixels",
         env_state_wrappers=None):
    """Returns an environment from a domain name, task name and optional settings.

  ```python
  env = suite.load('cartpole', 'balance')
  ```

  Adding a difficulty will configure distractions matching the reference paper
  for easy, medium, hard.

  Users can also toggle dynamic properties for distractions.

  Args:
    domain_name: A string containing the name of a domain.
    task_name: A string containing the name of a task.
    difficulty: Difficulty for the suite. One of 'easy', 'medium', 'hard'.
    dynamic: Boolean controlling whether distractions are dynamic or static.
    background_dataset_path: String to the davis directory that contains the
      video directories.
    background_dataset_videos: String ('train'/'val') or list of strings of the
      DAVIS videos to be used for backgrounds.
    background_kwargs: Dict, overwrites settings for background distractions.
    camera_kwargs: Dict, overwrites settings for camera distractions.
    color_kwargs: Dict, overwrites settings for color distractions.
    task_kwargs: Dict, dm control task kwargs.
    environment_kwargs: Optional `dict` specifying keyword arguments for the
      environment.
    visualize_reward: Optional `bool`. If `True`, object colours in rendered
      frames are set to indicate the reward at each step. Default `False`.
    render_kwargs: Dict, render kwargs for pixel wrapper.
    pixels_only: Boolean controlling the exclusion of states in the observation.
    pixels_observation_key: Key in the observation used for the rendered image.
    env_state_wrappers: Env state wrappers to be called before the PixelWrapper.

  Returns:
    The requested environment.
  """
    if not is_available():
        raise ImportError("dm_control module is not available. Make sure you "
                          "follow the installation instructions from the "
                          "dm_control package.")

    if difficulty not in [None, "easy", "medium", "hard"]:
        raise ValueError(
            "Difficulty should be one of: 'easy', 'medium', 'hard'.")

    render_kwargs = render_kwargs or {}
    if "camera_id" not in render_kwargs:
        render_kwargs["camera_id"] = 2 if domain_name == "quadruped" else 0

    env = suite.load(domain_name,
                     task_name,
                     task_kwargs=task_kwargs,
                     environment_kwargs=environment_kwargs,
                     visualize_reward=visualize_reward)

    # Apply background distractions.
    if difficulty or background_kwargs:
        background_dataset_path = (background_dataset_path
                                   or suite_utils.DEFAULT_BACKGROUND_PATH)
        final_background_kwargs = dict()
        if difficulty:
            # Get kwargs for the given difficulty.
            num_videos = suite_utils.DIFFICULTY_NUM_VIDEOS[difficulty]
            final_background_kwargs.update(
                suite_utils.get_background_kwargs(domain_name, num_videos,
                                                  dynamic,
                                                  background_dataset_path,
                                                  background_dataset_videos))
        else:
            # Set the dataset path and the videos.
            final_background_kwargs.update(
                dict(dataset_path=background_dataset_path,
                     dataset_videos=background_dataset_videos))
        if background_kwargs:
            # Overwrite kwargs with those passed here.
            final_background_kwargs.update(background_kwargs)
        env = background.DistractingBackgroundEnv(env,
                                                  **final_background_kwargs)

    # Apply camera distractions.
    if difficulty or camera_kwargs:
        final_camera_kwargs = dict(camera_id=render_kwargs["camera_id"])
        if difficulty:
            # Get kwargs for the given difficulty.
            scale = suite_utils.DIFFICULTY_SCALE[difficulty]
            final_camera_kwargs.update(
                suite_utils.get_camera_kwargs(domain_name, scale, dynamic))
        if camera_kwargs:
            # Overwrite kwargs with those passed here.
            final_camera_kwargs.update(camera_kwargs)
        env = camera.DistractingCameraEnv(env, **final_camera_kwargs)

    # Apply color distractions.
    if difficulty or color_kwargs:
        final_color_kwargs = dict()
        if difficulty:
            # Get kwargs for the given difficulty.
            scale = suite_utils.DIFFICULTY_SCALE[difficulty]
            final_color_kwargs.update(
                suite_utils.get_color_kwargs(scale, dynamic))
        if color_kwargs:
            # Overwrite kwargs with those passed here.
            final_color_kwargs.update(color_kwargs)
        env = color.DistractingColorEnv(env, **final_color_kwargs)

    if env_state_wrappers is not None:
        for wrapper in env_state_wrappers:
            env = wrapper(env)
    # Apply Pixel wrapper after distractions. This is needed to ensure the
    # changes from the distraction wrapper are applied to the MuJoCo environment
    # before the rendering occurs.
    env = pixels.Wrapper(env,
                         pixels_only=pixels_only,
                         render_kwargs=render_kwargs,
                         observation_key=pixels_observation_key)

    return env
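
For orientation, a sketch of calling this distracting-control load() wrapper; the DAVIS dataset path is a placeholder, not something defined in the listing:

# Sketch only: background_dataset_path must point at a local DAVIS 480p copy.
env = load(domain_name="cheetah",
           task_name="run",
           difficulty="easy",
           dynamic=True,
           background_dataset_path="/path/to/DAVIS/JPEGImages/480p")
time_step = env.reset()
print(time_step.observation["pixels"].shape)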
Example #18
    def __init__(self,
                 domain,
                 task,
                 *args,
                 env=None,
                 normalize=True,
                 observation_keys=(),
                 goal_keys=(),
                 unwrap_time_limit=True,
                 pixel_wrapper_kwargs=None,
                 **kwargs):
        assert not args, (
            "Gym environments don't support args. Use kwargs instead.")

        self.normalize = normalize
        self.unwrap_time_limit = unwrap_time_limit

        super(DmControlAdapter, self).__init__(domain,
                                               task,
                                               *args,
                                               goal_keys=goal_keys,
                                               **kwargs)

        if env is None:
            assert (domain is not None and task is not None), (domain, task)
            env = suite.load(
                domain_name=domain,
                task_name=task,
                task_kwargs=kwargs
                # TODO(hartikainen): Figure out how to pass kwargs to this guy.
                # Need to split into `task_kwargs`, `environment_kwargs`, and
                # `visualize_reward` bool. Check the suite.load(.) in:
                # https://github.com/deepmind/dm_control/blob/master/dm_control/suite/__init__.py
            )
            self._env_kwargs = kwargs
        else:
            assert not kwargs
            assert domain is None and task is None, (domain, task)

        if normalize:
            if (np.any(env.action_spec().minimum != -1)
                    or np.any(env.action_spec().maximum != 1)):
                env = action_scale.Wrapper(env, minimum=-1.0, maximum=1.0)
            np.testing.assert_equal(env.action_spec().minimum, -1)
            np.testing.assert_equal(env.action_spec().maximum, 1)

        if pixel_wrapper_kwargs is not None:
            env = pixels.Wrapper(env, **pixel_wrapper_kwargs)

        self._env = env

        assert isinstance(env.observation_spec(), OrderedDict)
        self.observation_keys = (observation_keys
                                 or tuple(env.observation_spec().keys()))

        observation_space = convert_dm_control_to_gym_space(
            env.observation_spec())

        self._observation_space = type(observation_space)([
            (name, copy.deepcopy(space))
            for name, space in observation_space.spaces.items()
            if name in self.observation_keys + self.goal_keys
        ])

        action_space = convert_dm_control_to_gym_space(self._env.action_spec())

        if len(action_space.shape) > 1:
            raise NotImplementedError(
                "Shape of the action space ({}) is not flat, make sure to"
                " check the implementation.".format(action_space))

        self._action_space = action_space
    def __init__(
            self,
            level: LevelSelection,
            frame_skip: int,
            visualization_parameters: VisualizationParameters,
            seed: Union[None, int] = None,
            human_control: bool = False,
            observation_type: ObservationType = ObservationType.Measurements,
            custom_reward_threshold: Union[int, float] = None,
            **kwargs):
        super().__init__(level, seed, frame_skip, human_control,
                         custom_reward_threshold, visualization_parameters)

        self.observation_type = observation_type

        # load and initialize environment
        domain_name, task_name = self.env_id.split(":")
        self.env = suite.load(domain_name=domain_name,
                              task_name=task_name,
                              task_kwargs={'random': seed})

        if observation_type != ObservationType.Measurements:
            self.env = pixels.Wrapper(
                self.env,
                pixels_only=observation_type == ObservationType.Image)

        # seed
        if self.seed is not None:
            np.random.seed(self.seed)
            random.seed(self.seed)

        self.state_space = StateSpace({})

        # image observations
        if observation_type != ObservationType.Measurements:
            self.state_space['pixels'] = ImageObservationSpace(
                shape=self.env.observation_spec()['pixels'].shape, high=255)

        # measurements observations
        if observation_type != ObservationType.Image:
            measurements_space_size = 0
            measurements_names = []
            for observation_space_name, observation_space in self.env.observation_spec(
            ).items():
                if len(observation_space.shape) == 0:
                    measurements_space_size += 1
                    measurements_names.append(observation_space_name)
                elif len(observation_space.shape) == 1:
                    measurements_space_size += observation_space.shape[0]
                    measurements_names.extend([
                        "{}_{}".format(observation_space_name, i)
                        for i in range(observation_space.shape[0])
                    ])
            self.state_space['measurements'] = VectorObservationSpace(
                shape=measurements_space_size,
                measurements_names=measurements_names)

        # actions
        self.action_space = BoxActionSpace(
            shape=self.env.action_spec().shape[0],
            low=self.env.action_spec().minimum,
            high=self.env.action_spec().maximum)

        # initialize the state by getting a new state from the environment
        self.reset_internal_state(True)

        # render
        if self.is_rendered:
            image = self.get_rendered_image()
            scale = 1
            if self.human_control:
                scale = 2
            if not self.native_rendering:
                self.renderer.create_screen(image.shape[1] * scale,
                                            image.shape[0] * scale)
import numpy as np
from dm_control import suite
from PIL import Image

import cv2
import os
import glob

env = suite.load(domain_name="humanoid", task_name='run')

action_spec = env.action_spec()
time_step = env.reset()
time_step_counter = 0

while not time_step.last() and time_step_counter < 500:
    action = np.random.uniform(action_spec.minimum,
                               action_spec.maximum,
                               size=action_spec.shape)

    time_step = env.step(action)

    image_data = env.physics.render(height=480, width=480, camera_id="back")
    #img = Image.fromarray(image_data, 'RGB')
    #image = np.array(img)
    cv2.imwrite('frames/humanoid-%.3d.jpg' % time_step_counter, image_data)

    time_step_counter += 1
    print(time_step.reward, time_step.discount, time_step.observation)

    img_array = []
    for filename in glob.glob('frames/*.jpg'):
Example #21
    def __init__(self,
                 domain_name,
                 task_name,
                 horizon=None,
                 gamma=0.99,
                 task_kwargs=None,
                 dt=.01,
                 width_screen=480,
                 height_screen=480,
                 camera_id=0,
                 use_pixels=False,
                 pixels_width=64,
                 pixels_height=64):
        """
        Constructor.

        Args:
             domain_name (str): name of the environment;
             task_name (str): name of the task of the environment;
             horizon (int): the horizon;
             gamma (float): the discount factor;
             task_kwargs (dict, None): parameters of the task;
             dt (float, .01): duration of a control step;
             width_screen (int, 480): width of the screen;
             height_screen (int, 480): height of the screen;
             camera_id (int, 0): position of camera to render the environment;
             use_pixels (bool, False): if True, pixel observations are used
                rather than the state vector;
             pixels_width (int, 64): width of the pixel observation;
             pixels_height (int, 64): height of the pixel observation;

        """
        # MDP creation
        self.env = suite.load(domain_name, task_name, task_kwargs=task_kwargs)
        if use_pixels:
            self.env = pixels.Wrapper(self.env,
                                      render_kwargs={
                                          'width': pixels_width,
                                          'height': pixels_height
                                      })

        # get the default horizon
        if horizon is None:
            horizon = self.env._step_limit

        # Hack to ignore dm_control time limit.
        self.env._step_limit = np.inf

        if use_pixels:
            self._convert_observation_space = self._convert_observation_space_pixels
            self._convert_observation = self._convert_observation_pixels
        else:
            self._convert_observation_space = self._convert_observation_space_vector
            self._convert_observation = self._convert_observation_vector

        # MDP properties
        action_space = self._convert_action_space(self.env.action_spec())
        observation_space = self._convert_observation_space(
            self.env.observation_spec())
        mdp_info = MDPInfo(observation_space, action_space, gamma, horizon)

        self._viewer = ImageViewer((width_screen, height_screen), dt)
        self._camera_id = camera_id

        super().__init__(mdp_info)

        self._state = None
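
A minimal instantiation sketch; the class name DMControl is an assumption based on the MushroomRL environment this constructor appears to come from:

# Assumed class name; adjust to however this environment class is actually exported.
mdp = DMControl('walker', 'walk', horizon=1000, gamma=0.99)
print(mdp.info.action_space.shape, mdp.info.observation_space.shape)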
Example #22
def cartpole_environment(seed: int = 42):
    env = suite.load("cartpole", "swingup", {"random": seed})
    return env
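
A short usage sketch for the helper above; the random-action loop is illustrative, not part of the original:

import numpy as np

env = cartpole_environment(seed=0)
spec = env.action_spec()
time_step = env.reset()
while not time_step.last():
    # Sample a uniformly random action within the spec bounds and step the task.
    action = np.random.uniform(spec.minimum, spec.maximum, size=spec.shape)
    time_step = env.step(action)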
Example #23
    args.file_name = f"{args.policy}_{args.domain_name}_{args.batch_size}_{args.seed}"
    print("---------------------------------------")
    print(f"Policy: {args.policy}, Env: {args.domain_name}, Seed: {args.seed}")
    print("---------------------------------------")

    if not os.path.exists("./results"):
        os.makedirs("./results")

    if args.save_model and not os.path.exists("./models"):
        os.makedirs("./models")

    if not os.path.exists("./graphs"):
        os.makedirs("./graphs")

    env = suite.load(args.domain_name, args.task_name, {"random": args.seed})

    # Set seeds
    np.random.seed(args.seed)

    temp_timestep = env.reset()
    state_dim = flat_obs(temp_timestep.observation).shape[0]
    action_dim = env.action_spec().shape[0]
    max_action = float(env.action_spec().maximum[0])

    kwargs = {
        "state_dim": state_dim,
        "action_dim": action_dim,
        "max_action": max_action,
        "discount": args.discount,
    }
Example #24
from dm_control import suite
from dm_control import viewer
import numpy as np

env = suite.load(domain_name="finger", task_name="turn_easy")

action_spec = env.action_spec()
time_step = env.reset()


def random_policy(time_step):
    while not time_step.last():
        action = np.random.uniform(action_spec.minimum,
                                   action_spec.maximum,
                                   size=action_spec.shape)
        time_step = env.step(action)
        print(time_step.reward, time_step.discount, time_step.observation)


viewer.launch(env, policy=random_policy)
from dm_control import suite
from dm_control import viewer
import numpy as np

env = suite.load(domain_name="hopper", task_name="stand")
action_spec = env.action_spec()


# Define a uniform random policy.
def random_policy(time_step):
    del time_step  # Unused.
    return np.random.uniform(low=action_spec.minimum,
                             high=action_spec.maximum,
                             size=action_spec.shape)


# Launch the viewer application.
viewer.launch(env, policy=random_policy)
from dm_control import suite
from dm_control import viewer

# Load an environment from the Control Suite.
env = suite.load(domain_name="humanoid", task_name="stand")

# Launch the viewer application.
viewer.launch(env)
Example #27
def make_environment(domain_name: str = 'cartpole',
                     task_name: str = 'balance') -> dm_env.Environment:
    """Creates a control suite environment."""
    environment = suite.load(domain_name, task_name)
    environment = wrappers.SinglePrecisionWrapper(environment)
    return environment
import gym
import random
import collections
import numpy as np
import torch
from PIL import Image
import subprocess
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
import dmc2gym
from dm_control import suite

seed = 0
env = suite.load(domain_name='cartpole', task_name="balance", task_kwargs={'random': seed})
lr_mu        = 0.0005
path = "/zhome/38/5/117684/Desktop/Cartpole/network_test/actor_net200"
def _flatten_obs(obs):
    obs_pieces = []
    for v in obs.values():
        flat = np.array([v]) if np.isscalar(v) else v.ravel()
        obs_pieces.append(flat)
    return np.concatenate(obs_pieces, axis=0)
class MuNet(nn.Module):
    
    def __init__(self, n, o, learning_rate):
        super(MuNet, self).__init__()
        # network
                
        self.fc1 = nn.Linear(5, 400)
Example #29
    def _reward(self, state, action):
        self.env.reset()
        state = np.array(state, dtype='float32')
        with self.env.physics.reset_context():
            self.env.physics.set_state(state)
        timestep = self.env.step(action)
        return timestep.reward

    def _batch_reward(self, state, action):
        return [self._reward(s, a) for s, a in zip(state, action)]

    def reward(self, state, action, repeats=1):
        if state.ndim > 1:
            return self._batch_reward(state, action)
        else:
            return self._reward(state, action)


if __name__ == '__main__':
    env = suite.load('cheetah', 'run')
    states = np.random.random((32, 18))
    actions = np.random.random((32, 6))
    oracle = RewardOracle(env)
    from time import time
    times = []
    for _ in range(5):
        start = time()
        rewards = oracle.reward(states, actions)
        print(rewards[0:5])
        times.append(time() - start)
    print(np.mean(times), np.std(times))
Example #30
        if kwargs.get('mode', 'rgb_array') != 'rgb_array':
            raise ValueError("Only render mode 'rgb_array' is supported.")
        del args  # Unused
        del kwargs  # Unused
        return self._env.physics.render(*self._render_size,
                                        camera_id=self._camera_id)


# envgym = gym.make("Breakout-v4")
# envgym = DeepMindWrapper_gym(envgym)
# envgym.reset()
# for t in range(1000):
#   img = envgym.render()
#   s, _, _, _ = envgym.step(envgym.action_space.sample())

from dm_control import suite
# from planet.control.wrappers import DeepMindWrapper
#env = suite.load('cheetah', 'run')
env = suite.load('walker', 'walk')
env = DeepMindWrapper(env)

env.reset()
for t in range(1000):
    img = env.render()
    s, _, _, _ = env.step(env.action_space.sample())

print(env.action_space)
print(env.observation_space)

# env = DeepMindWrapper(env)