def __init__(self, show_gui=True, center_camera=False, use_theta=True,
             use_rk4=True, dynamics_noise=False, noise_sigma=0.01):
    high = np.array([
        2.3,
        np.finfo(np.float32).max,
        np.finfo(np.float32).max,
        np.finfo(np.float32).max
    ])
    # [theta, alpha, theta_dot, alpha_dot]
    self._observation_space = spaces.Box(-high, high)
    self._action_space = spaces.Box(np.array([-5.0]), np.array([5.0]))
    self.show_gui = show_gui
    self.use_theta = use_theta
    self.use_rk4 = use_rk4
    self.dynamic_noise = dynamics_noise
    self.noise_sigma = noise_sigma * self.fs
    self.noise_mu = 0
    if self.use_theta:
        self.theta_lin_param = np.array([
            self.Mp * self.Lr**2 + self.Jr,
            1 / 4 * self.Mp * self.Lp**2,
            1 / 2 * self.Mp * self.Lp * self.Lr,
            self.Jp + 1 / 4 * self.Mp * self.Lp**2,
            1 / 2 * self.Mp * self.Lp,
            self.Dr,
            self.Dp
        ])
    if show_gui:
        self.set_gui(center_camera)

def __init__(self):
    EzPickle.__init__(self)
    self.seed()
    self.viewer = None

    self.world = Box2D.b2World()
    self.moon = None
    self.lander = None
    self.particles = []

    self.prev_reward = None

    # useful range is -1 .. +1, but spikes can be higher
    self.observation_space = spaces.Box(-np.inf, np.inf, shape=(8,))

    if self.continuous:
        # Action is two floats [main engine, left-right engines].
        # Main engine: -1..0 off, 0..+1 throttle from 50% to 100% power. Engine can't work with less than 50% power.
        # Left-right: -1.0..-0.5 fire left engine, +0.5..+1.0 fire right engine, -0.5..0.5 off
        self.action_space = spaces.Box(-1, +1, (2,))
    else:
        # Nop, fire left engine, main engine, right engine
        self.action_space = spaces.Discrete(4)

    self.reset()

def __init__(self, randomize_tasks=False, n_tasks=2):
    if randomize_tasks:
        np.random.seed(1337)
        goals = [[np.random.uniform(-1., 1.), np.random.uniform(-1., 1.)]
                 for _ in range(n_tasks)]
    else:
        # some hand-coded goals for debugging
        goals = [
            np.array([10, -10]), np.array([10, 10]),
            np.array([-10, 10]), np.array([-10, -10]),
            np.array([0, 0]), np.array([7, 2]),
            np.array([0, 4]), np.array([-6, 9])
        ]
        goals = [g / 10. for g in goals]
    self.goals = goals

    self.reset_task(0)
    self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(2,))
    self.action_space = spaces.Box(low=-0.1, high=0.1, shape=(2,))

def __init__(self, model_path='./square2d.xml', distance_threshold=1e-1,
             frame_skip=2, goal=[0.3, 0.3], horizon=100):
    if model_path.startswith("/"):
        fullpath = model_path
    else:
        fullpath = os.path.join(os.path.dirname(__file__), model_path)
    if not path.exists(fullpath):
        raise IOError("File %s does not exist" % fullpath)

    self.model = load_model_from_path(fullpath)
    self.seed()
    self.sim = MjSim(self.model)
    self.data = self.sim.data
    self.viewer = None
    self.distance_threshold = distance_threshold
    self.frame_skip = frame_skip
    self.set_goal_location(goal)
    self.reward_type = 'dense'
    self.horizon = horizon
    self.time_step = 0
    obs = self.get_current_observation()
    self.action_space = spaces.Box(-1., 1., shape=(2,))
    self.observation_space = spaces.Box(-np.inf, np.inf, shape=obs.shape)

def __init__(self, fence=False, *args, **kwargs):
    MODEL_CLASS = AntEnv
    if fence:
        MODEL_CLASS.FILE = 'ant1.xml'
    else:
        MODEL_CLASS.FILE = 'ant.xml'
    Serializable.quick_init(self, locals())
    MazeEnv.__init__(self, *args, **kwargs)

    self._blank_maze = False
    # add goal obs
    self.blank_maze_obs = np.concatenate(
        [np.zeros(self._n_bins), np.zeros(self._n_bins)])
    # self.blank_maze_obs = np.zeros(self._n_bins)

    # The following caches the spaces so they are not re-instantiated every time
    shp = self.get_current_obs().shape
    ub = BIG * np.ones(shp)
    self._observation_space = spaces.Box(ub * -1, ub)

    shp = self.get_current_robot_obs().shape
    ub = BIG * np.ones(shp)
    self._robot_observation_space = spaces.Box(ub * -1, ub)

    shp = self.get_current_maze_obs().shape
    ub = BIG * np.ones(shp)
    self._maze_observation_space = spaces.Box(ub * -1, ub)

def __init__(self, *args, **kwargs):
    Serializable.quick_init(self, locals())
    MazeEnv.__init__(self, *args, **kwargs)

    self._blank_maze = False
    # add goal obs
    if self.direct_goal:
        self.blank_maze_obs = np.zeros(self._n_bins + 2)
    else:
        self.blank_maze_obs = np.concatenate(
            [np.zeros(self._n_bins), np.zeros(self._n_bins)])
    # self.blank_maze_obs = np.zeros(self._n_bins)

    # The following caches the spaces so they are not re-instantiated every time
    shp = self.get_current_obs().shape
    ub = BIG * np.ones(shp)
    self._observation_space = spaces.Box(ub * -1, ub)

    shp = self.get_current_robot_obs().shape
    ub = BIG * np.ones(shp)
    self._robot_observation_space = spaces.Box(ub * -1, ub)

    shp = self.get_current_maze_obs().shape
    ub = BIG * np.ones(shp)
    self._maze_observation_space = spaces.Box(ub * -1, ub)

def __init__(self, task={}, n_tasks=100, randomize_tasks=True, **kwargs):
    self.goals = self.sample_tasks(n_tasks)
    self.goal_radius = 0.09
    self._goal = [0, 0, 0.01]
    super(ReacherGoalEnv_sparse, self).__init__()
    self.reset_task(0)
    Serializable.__init__(self)
    self.observation_space = spaces.Box(low=self.observation_space.low,
                                        high=self.observation_space.high)
    self.action_space = spaces.Box(low=self.action_space.low,
                                   high=self.action_space.high)

def action_space(self):
    bounds = self.model.actuator_ctrlrange
    lb = bounds[:, 0]
    ub = bounds[:, 1]
    self.action_space_ture = spaces.Box(lb, ub)  # the model's own control range
    bounds = np.array([
        [-1, 1],  # TODO: bounds overridden by me
        [-1, 1]
    ])
    lb = bounds[:, 0]
    ub = bounds[:, 1]
    return spaces.Box(lb, ub)

def __init__(self, dynamics_noise='no_noise', std=np.array([1, 1]),
             friction=0.0, obs_noise=False, obs_noise_bound=0.075):
    self.gravity = 9.8
    self.masscart = 0.5
    self.masspole = 0.5
    self.total_mass = (self.masspole + self.masscart)
    self.length = 0.5  # actually half the pole's length
    self.polemass_length = (self.masspole * self.length)
    self.max_force = 10.0
    self.tau = 0.02  # seconds between state updates
    self.kinematics_integrator = 'euler'
    self.dynamics_noise = dynamics_noise  # if we have noise on the dynamics
    self.friction = friction
    self.obs_noise = obs_noise
    self.obs_noise_bound = obs_noise_bound
    if dynamics_noise not in [
            'no_noise', 'gaussian', 'gaussian_state_dependent', 'gumbel'
    ]:
        raise ValueError("wrong noise")
    if type(std) == np.ndarray:
        # note: we divide here by tau to have N(s_{t+1} | s_t + delta_t * \mu(s_t, a_t), \sigma)
        self.std = std / self.tau
    else:
        self.std = np.ones(2) * std / self.tau
    self.std_torch = torch.from_numpy(self.std).float()

    # Angle at which to fail the episode
    self.theta_threshold_radians = math.pi
    self.x_threshold = 3
    # set a maximum velocity, s.t. we can clip too high velocities
    # Angle limit set to 2 * theta_threshold_radians so failing observation is still within bounds
    high = np.array([
        self.x_threshold * 1.2, 1e2, self.theta_threshold_radians * 1.2, 1e2
    ])

    # continuous action space between -force_mag and force_mag
    self._action_space = spaces.Box(np.array([-self.max_force]),
                                    np.array([self.max_force]))
    self._observation_space = spaces.Box(-high, high)

    self.viewer = None
    self.state = None
    self.steps_beyond_done = None

def get_aug_obs_space(obs_space, variant):
    if variant['concat_type'] == 'concatenation':
        low = np.hstack([obs_space.low, np.full(variant['num_skills'], 0)])
        high = np.hstack([obs_space.high, np.full(variant['num_skills'], 1)])
        aug_obs_space = spaces.Box(low=low, high=high)
        return aug_obs_space
    elif variant['concat_type'] == 'bilinear':
        dim = obs_space.shape[0] * variant['num_skills']
        low = np.ones(dim) * obs_space.low[0]
        high = np.ones(dim) * obs_space.high[0]
        aug_obs_space = spaces.Box(low=low, high=high)
        return aug_obs_space
    else:
        raise NotImplementedError

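# A minimal usage sketch for get_aug_obs_space, assuming gym-style spaces
# (rllab's Box behaves the same for this call). The 4-dimensional base space
# and the variant values below are made up purely for illustration.
import numpy as np
from gym import spaces

base_space = spaces.Box(low=-np.ones(4), high=np.ones(4))
example_variant = {'concat_type': 'concatenation', 'num_skills': 5}
aug_space = get_aug_obs_space(base_space, example_variant)
# 'concatenation' appends one [0, 1] slot per skill, so the result has 4 + 5 = 9 dims.
assert aug_space.shape == (9,)
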
def action_space(self):
    lat_dim = self.low_policy_latent_dim
    if self.discrete_actions:
        return spaces.Discrete(lat_dim)  # the action is now just a selection
    else:
        ub = 1e6 * np.ones(lat_dim)
        return spaces.Box(-1 * ub, ub)

def action_space(self):
    if isinstance(self._wrapped_env.action_space, Box):
        wrapped_low = np.append(self._wrapped_env.action_space.low, [-1])
        wrapped_high = np.append(self._wrapped_env.action_space.high, [1])
        return spaces.Box(wrapped_low, wrapped_high)
    else:
        raise NotImplementedError

def __init__(self, visit_axis_bound=None, *args, **kwargs):
    super(MujocoEnv_ObsInit, self).__init__(*args, **kwargs)
    shp = self.get_current_obs().shape
    ub = BIG * np.ones(shp)
    self._observation_space = spaces.Box(ub * -1, ub)
    self.visit_axis_bound = visit_axis_bound

def observation_space(self):
    if self.position_only:
        d = len(self._get_position_ids())
    else:
        d = len(self.extra_data.states)
    ub = BIG * np.ones(d)
    return spaces.Box(ub * -1, ub)

def observation_space(self):
    dim = self.wrapped_env.observation_space.flat_dim
    if self.goal_vector_obs:
        newdim = dim + (self.n_apples + self.n_bombs) * 3
    else:
        newdim = dim + self.n_bins * 2
    ub = BIG * np.ones(newdim)
    return spaces.Box(ub * -1, ub)

def action_space(self):
    if self.mode == 1:
        lower_bound = np.array([0, 0])
        upper_bound = np.array([200, 200])
    elif self.mode == 0:
        lower_bound = np.array([0, 0])
        upper_bound = np.array([0.3, 0.3])
    return spaces.Box(lower_bound, upper_bound)

def observation_space(self):
    if self._normalize_obs:
        if isinstance(self._wrapped_env.observation_space, Box):
            ub = np.ones(self._wrapped_env.observation_space.shape)
            return spaces.Box(-1 * ub, ub)
        return self._wrapped_env.observation_space
    else:
        return self._wrapped_env.observation_space

def __init__(self, env, num_skills, z):
    Serializable.quick_init(self, locals())
    self._env = env
    self._num_skills = num_skills
    self._z = z
    obs_space = self._env.observation_space
    low = np.hstack([obs_space.low, np.full(num_skills, 0)])
    high = np.hstack([obs_space.high, np.full(num_skills, 1)])
    self.observation_space = spaces.Box(low=low, high=high)
    self.action_space = self._env.action_space
    self.spec = EnvSpec(self.observation_space, self.action_space)

def observation_space(self):
    # self.observation_space = spaces.Box(np.array([-1, -1, 0, 0, -2, -1]), np.array([1, 1, 1, 1, 2, 1]))
    if self.normlize_obs:
        if self.use_nearby_obs:
            return spaces.Box(-1, 1, ((2 * self.sensor_range + 1)**2 + 2, ))
        else:
            return spaces.Box(-1, 1, (3 + 2 * self.n_key, ))
    else:
        if self.use_nearby_obs:
            return spaces.Box(
                np.array([0, 0, 0] + [0] * 24),
                np.array([self._map.shape[0], self._map.shape[1], self.n_key] +
                         [5] * ((2 * self.sensor_range + 1)**2 - 1)))
        else:
            return spaces.Box(
                np.array([0, 0, 0] + [0] * self.n_key),
                np.array([self._map.shape[0], self._map.shape[1], self.n_key] +
                         [self._map.shape[0], self._map.shape[1]] * self.n_key))

def inject_action_noise(self, action):
    # added: rebuild the action space from the actuator control range
    bounds = self.model.actuator_ctrlrange
    lb = bounds[:, 0]
    ub = bounds[:, 1]
    action_space = spaces.Box(lb, ub)
    # generate action noise
    noise = self.action_noise * \
        np.random.normal(size=action.shape)
    # rescale the noise to make it proportional to the action bounds
    # lb, ub = self.action_bounds  # original version
    lb, ub = action_space.bounds
    noise = 0.5 * (ub - lb) * noise
    return action + noise

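# Standalone illustration of the rescaling done in inject_action_noise; the
# bounds and the 0.1 noise scale below are made up. Unit Gaussian noise is
# multiplied by half the control range per dimension, so actuators with wider
# ranges receive proportionally larger perturbations.
import numpy as np

lb = np.array([-2.0, -0.5])
ub = np.array([2.0, 0.5])
raw_noise = 0.1 * np.random.normal(size=2)   # plays the role of self.action_noise * N(0, 1)
scaled_noise = 0.5 * (ub - lb) * raw_noise   # per-dimension scale: [2.0, 0.5]
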
def __init__(self, env, init_selector, goal_selector, append_goal=False,
             terminal_bonus=0, terminal_eps=0.1, distance_metric='L2',
             goal_reward='NegativeDistance', goal_weight=1, inner_weight=0,
             angle_idxs=(None, ), persistence=3):
    """
    :param env: wrapped env NEEDS RESET WITH INIT_STATE arg!
    :param init_selector: already instantiated: NEEDS GOOD DIM OF GOALS! --> TODO: give the class a method to update dim?
    :param terminal_bonus: if not 0, the rollout terminates with this bonus if the goal state is reached
    :param terminal_eps: eps around which the terminal goal is considered reached
    :param distance_metric: L1 or L2 or a callable func
    :param goal_reward: NegativeDistance or InverseDistance or callable func
    :param goal_weight: coef of the goal-dist reward
    :param inner_weight: coef of the inner reward
    """
    Serializable.quick_init(self, locals())
    self.append_goal = append_goal
    self.terminal_bonus = terminal_bonus
    self.terminal_eps = terminal_eps
    self._distance_metric = distance_metric
    self._goal_reward = goal_reward
    self.goal_weight = goal_weight
    self.inner_weight = inner_weight
    self.persistence = persistence
    self.persistence_count = 1
    self.fig_number = 0
    ProxyEnv.__init__(self, env)
    InitBaseAngle.__init__(self, angle_idxs=angle_idxs,
                           init_selector=init_selector,
                           goal_selector=goal_selector)
    ub = BIG * np.ones(self.get_current_obs().shape)
    self._observation_space = spaces.Box(ub * -1, ub)
    # keep around all sampled goals, one dict per training itr (i.e., call of this env.log_diagnostics)
    self.inits_trained = []

def set_param_space(self, eps_scale=0.5):
    """
    Set self._param_space, a dict(string, rllab.spaces.base.Space): one Box per
    parameter, centered on its current value with half-width eps_scale * |value|.
    Parameters listed in positive_params have their lower bound clipped to stay positive.
    """
    params = {**self._get_param1(), **self._get_param2()}
    if not isinstance(eps_scale, list):
        eps_scale = np.ones(len(params)) * eps_scale
    self._param_space = dict()
    i = 0
    for param, value in params.items():
        eps = np.abs(value) * eps_scale[i]
        ub = value + eps
        lb = value - eps
        for name in positive_params:
            if name in param:
                lb = np.clip(lb, 1e-3, ub)
                break
        space = spaces.Box(lb, ub)
        self._param_space[param] = space
        i += 1

def maze_observation_space(self):
    shp = np.concatenate(self.get_readings()).shape
    ub = BIG * np.ones(shp)
    return spaces.Box(ub * -1, ub)

def observation_space(self):
    dim = self.inner_env.observation_space.flat_dim
    newdim = dim + self.n_bins * 2
    ub = BIG * np.ones(newdim)
    return spaces.Box(ub * -1, ub)

def action_space(self):
    bounds = self.model.actuator_ctrlrange
    lb = bounds[:, 0]
    ub = bounds[:, 1]
    return spaces.Box(lb, ub)

def run_experiment(variant):
    if variant['env_name'] == 'humanoid-rllab':
        from rllab.envs.mujoco.humanoid_env import HumanoidEnv
        env = normalize(HumanoidEnv())
    elif variant['env_name'] == 'swimmer-rllab':
        from rllab.envs.mujoco.swimmer_env import SwimmerEnv
        env = normalize(SwimmerEnv())
    elif variant["env_name"] == "Point2D-v0":
        import sac.envs.point2d_env
        env = GymEnv(variant["env_name"])
    else:
        env = normalize(GymEnv(variant['env_name']))

    obs_space = env.spec.observation_space
    assert isinstance(obs_space, spaces.Box)
    low = np.hstack([obs_space.low, np.full(variant['num_skills'], 0)])
    high = np.hstack([obs_space.high, np.full(variant['num_skills'], 1)])
    aug_obs_space = spaces.Box(low=low, high=high)
    aug_env_spec = EnvSpec(aug_obs_space, env.spec.action_space)

    pool = SimpleReplayBuffer(
        env_spec=aug_env_spec,
        max_replay_buffer_size=variant['max_pool_size'],
    )

    base_kwargs = dict(
        min_pool_size=variant['max_path_length'],
        epoch_length=variant['epoch_length'],
        n_epochs=variant['n_epochs'],
        max_path_length=variant['max_path_length'],
        batch_size=variant['batch_size'],
        n_train_repeat=variant['n_train_repeat'],
        eval_render=False,
        eval_n_episodes=1,
        eval_deterministic=True,
        sampler=SimpleSampler(
            max_path_length=variant["max_path_length"],
            min_pool_size=variant["max_path_length"],
            batch_size=variant["batch_size"]))

    M = variant['layer_size']
    qf = NNQFunction(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
    )
    vf = NNVFunction(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
    )
    policy = GaussianPolicy(
        env_spec=aug_env_spec,
        hidden_layer_sizes=[M, M],
        reg=0.001,
    )
    # policy = GMMPolicy(
    #     env_spec=aug_env_spec,
    #     K=variant['K'],
    #     hidden_layer_sizes=[M, M],
    #     qf=qf,
    #     reg=0.001,
    # )
    discriminator = NNDiscriminatorFunction(
        env_spec=env.spec,
        hidden_layer_sizes=[M, M],
        num_skills=variant['num_skills'],
    )

    algorithm = DIAYN(
        base_kwargs=base_kwargs,
        env=env,
        policy=policy,
        discriminator=discriminator,
        pool=pool,
        qf=qf,
        vf=vf,
        lr=variant['lr'],
        scale_entropy=variant['scale_entropy'],
        discount=variant['discount'],
        tau=variant['tau'],
        num_skills=variant['num_skills'],
        save_full_state=False,
        include_actions=variant['include_actions'],
        learn_p_z=variant['learn_p_z'],
        add_p_z=variant['add_p_z'],
        reparametrize=variant["reparametrize"])

    algorithm.train()

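# Hypothetical variant dictionary covering every key that run_experiment above
# reads. The hyperparameter values are placeholders chosen for the sketch, not
# the settings used by the original project.
example_variant = {
    'env_name': 'Point2D-v0',
    'num_skills': 10,
    'max_pool_size': int(1e6),
    'max_path_length': 1000,
    'epoch_length': 1000,
    'n_epochs': 100,
    'batch_size': 128,
    'n_train_repeat': 1,
    'layer_size': 256,
    'lr': 3e-4,
    'scale_entropy': 0.1,
    'discount': 0.99,
    'tau': 0.01,
    'include_actions': False,
    'learn_p_z': False,
    'add_p_z': True,
    'reparametrize': True,
}
run_experiment(example_variant)
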
def observation_space(self):
    shp = self.get_current_image_obs()[1].shape
    ub = BIG * np.ones(shp)
    return spaces.Box(ub * -1, ub)

def maze_observation_space(self):
    shp = self.get_current_maze_obs().shape
    ub = BIG * np.ones(shp)
    return spaces.Box(ub * -1, ub)

def action_space(self):
    if isinstance(self._wrapped_env.action_space, Box):
        ub = np.ones(self._wrapped_env.action_space.shape)
        return spaces.Box(-1 * ub, ub)
    return self._wrapped_env.action_space

def action_space(self):
    lb = np.array(
        [control.ctrllimit[0] for control in self.extra_data.controls])
    ub = np.array(
        [control.ctrllimit[1] for control in self.extra_data.controls])
    return spaces.Box(lb, ub)