def test_actor_critic(ex_dir, env: SimEnv, policy: Policy, algo, algo_hparam, vfcn_type, use_cuda):
    """Smoke test: run an actor-critic algorithm for a few iterations with different value function types."""
    pyrado.set_seed(0)

    if use_cuda:
        policy._device = "cuda"
        policy = policy.to(device="cuda")

    # Build the value function requested by the test parametrization
    if vfcn_type == "fnn-plain":
        vfcn = FNN(
            input_size=env.obs_space.flat_dim,
            output_size=1,
            hidden_sizes=[16, 16],
            hidden_nonlin=to.tanh,
            use_cuda=use_cuda,
        )
    elif vfcn_type == FNNPolicy.name:
        vfcn = FNNPolicy(
            EnvSpec(env.obs_space, ValueFunctionSpace),
            hidden_sizes=[16, 16],
            hidden_nonlin=to.tanh,
            use_cuda=use_cuda,
        )
    elif vfcn_type == RNNPolicy.name:
        vfcn = RNNPolicy(
            EnvSpec(env.obs_space, ValueFunctionSpace),
            hidden_size=16,
            num_recurrent_layers=1,
            use_cuda=use_cuda,
        )
    else:
        raise NotImplementedError

    # Critic based on generalized advantage estimation
    critic = GAE(vfcn, gamma=0.98, lamda=0.95, batch_size=32, lr=1e-3, standardize_adv=False)

    # Merge the shared hyper-parameters with the algorithm-specific ones
    hparam = dict(max_iter=2, min_rollouts=3, num_workers=1)
    hparam.update(algo_hparam)

    # Create the algorithm and train briefly
    algo = algo(ex_dir, env, policy, critic, **hparam)
    algo.train()
    assert algo.curr_iter == algo.max_iter
def create_flipping_task(
    env_spec: EnvSpec,
    obs_labels: Sequence[str],
    des_angle_delta: float = np.pi / 2.0,
    endless: bool = True,
) -> MaskedTask:
    """
    Create a task for rotating an object.

    .. note:: Designed with RcsPySim environments in mind, but not restricted to them.

    :param env_spec: environment specification
    :param obs_labels: labels for selection, e.g. ['Box_A']. This needs to match the observations' names in RcsPySim
    :param des_angle_delta: desired angle to rotate. If reached, the task is reset, and rotating continues.
    :param endless: if `True`, the task will promote endlessly repeated flipping about the desired angle, else only
                    one flip is desired
    :return: masked task that only considers a subspace of all observations
    """
    # Restrict the state space to the selected observations
    state_mask = env_spec.state_space.create_mask(obs_labels)
    spec = EnvSpec(env_spec.obs_space, env_spec.act_space, env_spec.state_space.subspace(state_mask))

    # Reward the cosine of the (single) selected angle
    task = FlippingTask(spec, des_angle_delta, CosOfOneEleRewFcn(idx=0), endless=endless)

    return MaskedTask(env_spec, task, obs_labels)
def create_home_pos_task(env_spec: EnvSpec, obs_labels: Sequence[str], state_des: np.ndarray) -> MaskedTask:
    """
    Create a task for moving the robot to a safe (home) position.

    .. note:: Designed with RcsPySim environments in mind, but not restricted to them.

    :param env_spec: environment specification
    :param obs_labels: labels for selection, e.g. ['PowerGrasp_R_Y', 'PowerGrasp_R_Z']. This needs to match the
                       observations' names in RcsPySim
    :param state_des: desired state (depends of the coordinate system). If reached, the task is over.
    :return: masked task that only considers a subspace of all observations
    """
    # Restrict the state space to the selected observations
    state_mask = env_spec.state_space.create_mask(obs_labels)
    spec = EnvSpec(env_spec.obs_space, env_spec.act_space, env_spec.state_space.subspace(state_mask))

    # Exponentiated quadratic error on the selected states, small action penalty
    weights_state = 1e1 * np.eye(len(state_des))
    weights_act = 1e-1 * np.eye(spec.act_space.flat_dim)
    task = DesStateTask(spec, state_des, ExpQuadrErrRewFcn(weights_state, weights_act))

    return MaskedTask(env_spec, task, obs_labels)
def create_goal_dist_task(env_spec: EnvSpec, ds_label: int, rew_fcn: RewFcn, succ_thold: float = 0.01) -> MaskedTask:
    """
    Create a task that rewards minimizing the `GoalDistance` of dynamical system movements primitives (see RcsPySim).

    .. note:: Designed with RcsPySim environments in mind, but not restricted to them.

    :param env_spec: environment specification
    :param ds_label: label of the dynamical system (see RcsPySim)
    :param rew_fcn: reward function
    :param succ_thold: once the object of interest is closer than this threshold, the task is considered successfully
    :return: masked task that only considers a subspace of all observations
    """
    # Selection labels; these need to match the observations' names in RcsPySim
    obs_labels = [f"GD_DS{ds_label}"]

    # Restrict the state space, falling back to an empty space if there is none
    state_space = env_spec.state_space
    masked_state_space = (
        state_space.subspace(state_space.create_mask(obs_labels)) if state_space is not EmptySpace else EmptySpace
    )
    spec = EnvSpec(env_spec.obs_space, env_spec.act_space, masked_state_space)

    # Desired state task with goal [0, 0]; success once within the proximity threshold
    succ_fcn = functools.partial(proximity_succeeded, thold_dist=succ_thold)
    task = DesStateTask(spec, np.zeros(2), rew_fcn, succ_fcn)

    return MaskedTask(env_spec, task, obs_labels)
def create_forcemin_task(env_spec: EnvSpec, obs_labels: Sequence[str], Q: np.ndarray) -> MaskedTask:
    """
    Create a task which punishes the amount of used force.

    .. note:: Designed with RcsPySim environments in mind, but not restricted to them.

    :param env_spec: environment specification
    :param obs_labels: labels for selection, e.g. ['WristLoadCellLBR_R_Fy']. This needs to match the observations'
                       names in RcsPySim
    :param Q: weight matrix of dim NxN with N=num_forces for the quadratic force costs
    :return: masked task that only considers a subspace of all observations
    """
    # Restrict the state space to the selected observations
    state_mask = env_spec.state_space.create_mask(obs_labels)
    spec = EnvSpec(env_spec.obs_space, env_spec.act_space, env_spec.state_space.subspace(state_mask))

    # Endlessly running desired state task: desired forces are zero, actions are not penalized
    act_dim = spec.act_space.flat_dim
    rew_fcn = QuadrErrRewFcn(Q=Q, R=np.zeros((act_dim, act_dim)))
    task = DesStateTask(spec, np.zeros(spec.state_space.shape), rew_fcn, never_succeeded)

    return MaskedTask(env_spec, task, obs_labels)
def create_collision_task(env_spec: EnvSpec, factor: float) -> MaskedTask:
    """
    Create a task which punishes collision costs given a collision model with pairs of bodies.
    This task only looks at the instantaneous collision cost.

    .. note:: Designed with RcsPySim environments in mind, but not restricted to them.

    :param env_spec: environment specification
    :param factor: cost / reward function scaling factor
    :return: masked task that only considers a subspace of all observations
    """
    # Written as `not >=` so that NaN is rejected as well
    if not factor >= 0:
        raise pyrado.ValueErr(given=factor, ge_constraint="0")

    # Selection label; this needs to match the observations' names in RcsPySim
    obs_labels = ["CollCost"]

    # Restrict the state space to the collision cost observation
    state_mask = env_spec.state_space.create_mask(obs_labels)
    spec = EnvSpec(env_spec.obs_space, env_spec.act_space, env_spec.state_space.subspace(state_mask))

    # Endlessly running desired state task: zero collision cost is desired, actions are not penalized
    rew_fcn = AbsErrRewFcn(q=np.array([factor]), r=np.zeros(spec.act_space.shape))
    task = DesStateTask(spec, np.zeros(spec.state_space.shape), rew_fcn, never_succeeded)

    return MaskedTask(env_spec, task, obs_labels)
def create_box_lift_task(env_spec: EnvSpec, continuous_rew_fcn: bool, succ_thold: float):
    """
    Create a task for lifting a box to a desired height.

    :param continuous_rew_fcn: if `True`, use an exponentiated quadratic error reward, else -1 per step
    :param succ_thold: distance threshold below which the task counts as succeeded
    :return: masked task that only considers a subspace of all observations
    """
    # Selection label; this needs to match the observations' names in RcsPySim
    idcs = ['Box_Z']

    # Restrict the state space to the box height
    spec = EnvSpec(
        env_spec.obs_space,
        env_spec.act_space,
        env_spec.state_space.subspace(env_spec.state_space.create_mask(idcs)),
    )

    state_des = np.array([1.1])  # box position is measured world coordinates

    if continuous_rew_fcn:
        rew_fcn = ExpQuadrErrRewFcn(np.diag([3e1]), 1e0 * np.eye(spec.act_space.flat_dim))
    else:
        rew_fcn = MinusOnePerStepRewFcn()

    task = DesStateTask(spec, state_des, rew_fcn, functools.partial(proximity_succeeded, thold_dist=succ_thold))

    return MaskedTask(env_spec, task, idcs)
def create_task_space_discrepancy_task(env_spec: EnvSpec, rew_fcn: RewFcn) -> MaskedTask:
    """
    Create a task which punishes the discrepancy between the actual and the commanded state of the observed body.
    The observed body is specified in in the associated experiment configuration file in RcsPySim.
    This task only looks at the X and Z coordinates.

    :param env_spec: environment specification
    :param rew_fcn: reward function
    :return: masked task
    """
    # Collect every state label that belongs to a task-space discrepancy observation
    idcs = [label for label in env_spec.state_space.labels if 'DiscrepTS' in label]
    if not idcs:
        raise pyrado.ValueErr(msg="No state space labels found that contain 'DiscrepTS'")

    # Restrict the state space to the discrepancy observations
    spec = EnvSpec(
        env_spec.obs_space,
        env_spec.act_space,
        env_spec.state_space.subspace(env_spec.state_space.create_mask(idcs)),
    )

    # Zero discrepancy is desired; the task never stops because of success
    dst = DesStateTask(spec, np.zeros(spec.state_space.shape), rew_fcn, never_succeeded)

    return MaskedTask(env_spec, dst, idcs)
def reset(self, env_spec: EnvSpec, **kwargs):
    """
    Reset the wrapped task, passing it an environment specification whose action and state spaces are
    reduced to the stored index selections.

    :param env_spec: full (unmasked) environment specification
    :param kwargs: forwarded to the wrapped task's `reset()`
    """
    self._env_spec = env_spec

    # Derive the state mask from the stored indices, defaulting to all-ones (select everything)
    if self._state_idcs is None:
        self._state_mask = np.ones(env_spec.state_space.shape, dtype=np.bool_)
    else:
        self._state_mask = env_spec.state_space.create_mask(self._state_idcs)

    # Same for the action mask
    if self._action_idcs is None:
        self._action_mask = np.ones(env_spec.act_space.shape, dtype=np.bool_)
    else:
        self._action_mask = env_spec.act_space.create_mask(self._action_idcs)

    # An empty state space cannot be masked, so it is passed through unchanged
    masked_state_space = (
        env_spec.state_space.subspace(self._state_mask)
        if env_spec.state_space is not EmptySpace
        else EmptySpace
    )

    self._wrapped_task.reset(
        env_spec=EnvSpec(
            env_spec.obs_space,
            env_spec.act_space.subspace(self._action_mask),
            masked_state_space,
        ),
        **kwargs,
    )
def create_box_flip_task(env_spec: EnvSpec, continuous_rew_fcn):
    """
    Create a task for endlessly flipping a box about one axis.

    :param continuous_rew_fcn: if `True`, use a cosine-based reward on the box angle, else -1 per step
    :return: masked task that only considers a subspace of all observations
    """
    # Selection label; this needs to match the observations' names in RcsPySim
    idcs = ['Box_A']

    # Restrict the state space to the box angle
    spec = EnvSpec(
        env_spec.obs_space,
        env_spec.act_space,
        env_spec.state_space.subspace(env_spec.state_space.create_mask(idcs)),
    )

    state_des = np.array([-np.pi/2])  # box position is measured world coordinates

    rew_fcn = CosOfOneEleRewFcn(idx=0) if continuous_rew_fcn else MinusOnePerStepRewFcn()
    ef_task = EndlessFlippingTask(spec, rew_fcn, init_angle=0.)

    return MaskedTask(env_spec, ef_task, idcs)
def __init__(self,
             env_spec: EnvSpec,
             batch_size: int,
             reward_multiplier: float,
             lr: float = 3e-3,
             logger: StepLogger = None,
             device: str = 'cuda' if to.cuda.is_available() else 'cpu'):
    """
    Constructor

    :param env_spec: environment specification
    :param batch_size: batch size for each update step
    :param reward_multiplier: factor for the predicted probability
    :param lr: learning rate
    :param logger: logger for every step of the algorithm, if `None` the default logger will be created
    :param device: device to train on, e.g. 'cuda' or 'cpu'; defaults to 'cuda' if available
    """
    self.device = device
    self.batch_size = batch_size
    self.reward_multiplier = reward_multiplier
    self.lr = lr

    # The discriminator scores (obs, next_obs, act) triples, hence the concatenated observation space;
    # its scalar output is squashed to [0, 1] by the sigmoid output nonlinearity
    spec = EnvSpec(
        obs_space=BoxSpace.cat([env_spec.obs_space, env_spec.obs_space, env_spec.act_space]),
        act_space=BoxSpace(bound_lo=[0], bound_up=[1]),
    )
    self.discriminator = FNNPolicy(spec=spec, hidden_nonlin=to.tanh, hidden_sizes=[62], output_nonlin=to.sigmoid)
    self.loss_fcn = nn.BCELoss()
    self.optimizer = to.optim.Adam(self.discriminator.parameters(), lr=lr, eps=1e-5)
    self.logger = logger
def create_box_upper_shelve_task(env_spec: EnvSpec, continuous_rew_fcn: bool, succ_thold: float):
    """
    Create a task for placing a box onto the upper shelve.

    :param env_spec: environment specification
    :param continuous_rew_fcn: if `True`, use an exponentiated quadratic error reward, else -1 per step
    :param succ_thold: distance threshold below which the task counts as succeeded
    :return: masked task that only considers a subspace of all observations
    """
    # Define the indices for selection. This needs to match the observations' names in RcsPySim.
    idcs = ['Box_X', 'Box_Y', 'Box_Z', 'Box_A', 'Box_B', 'Box_C']

    # Get the masked environment specification
    spec = EnvSpec(
        env_spec.obs_space,
        env_spec.act_space,
        env_spec.state_space.subspace(env_spec.state_space.create_mask(idcs)),
    )

    # Create a desired state task
    state_des = np.zeros(6)  # zeros since we observe the box position relative to the goal
    if continuous_rew_fcn:
        Q = np.diag([5e0, 5e0, 5e0, 1e-1, 1e-1, 1e-1])
        R = 5e-2 * np.eye(spec.act_space.flat_dim)
        rew_fcn = ExpQuadrErrRewFcn(Q, R)
    else:
        # Fix: instantiate the reward function (the original assigned the class object itself, unlike
        # the sibling creators which call MinusOnePerStepRewFcn())
        rew_fcn = MinusOnePerStepRewFcn()
    dst = DesStateTask(spec, state_des, rew_fcn, functools.partial(proximity_succeeded, thold_dist=succ_thold))

    # Return the masked tasks
    return MaskedTask(env_spec, dst, idcs)
def create_collision_task(env_spec: EnvSpec, factor: float) -> MaskedTask:
    """
    Create a task which punishes collision costs given a collision model with pairs of bodies.
    This task only looks at the instantaneous collision cost.

    :param env_spec: environment specification
    :param factor: cost / reward function scaling factor
    :return: masked task
    """
    # Written as `not >=` so that NaN is rejected as well
    if not factor >= 0:
        raise pyrado.ValueErr(given=factor, ge_constraint='0')

    # Selection label; this needs to match the observations' names in RcsPySim
    idcs = ['CollCost']

    # Restrict the state space to the collision cost observation
    spec = EnvSpec(
        env_spec.obs_space,
        env_spec.act_space,
        env_spec.state_space.subspace(env_spec.state_space.create_mask(idcs)),
    )

    # Zero collision cost is desired; the task never stops because of success
    rew_fcn = AbsErrRewFcn(q=np.array([factor]), r=np.zeros(spec.act_space.shape))
    dst = DesStateTask(spec, np.zeros(spec.state_space.shape), rew_fcn, never_succeeded)

    return MaskedTask(env_spec, dst, idcs)
def create_nonrecurrent_policy():
    """Build a linear policy with constant, identity, and squared features on a 4-dim obs / 3-dim act space."""
    spec = EnvSpec(BoxSpace(-1, 1, 4), BoxSpace(-1, 1, 3))
    feats = FeatureStack(const_feat, identity_feat, squared_feat)
    return LinearPolicy(spec, feats)
def create_recurrent_policy():
    """Build a single-layer RNN policy with tanh nonlinearity on a 4-dim obs / 3-dim act space."""
    spec = EnvSpec(BoxSpace(-1, 1, 4), BoxSpace(-1, 1, 3))
    return RNNPolicy(spec, hidden_size=32, num_recurrent_layers=1, hidden_nonlin='tanh')
def test_sprl(ex_dir, env: SimEnv, optimize_mean: bool):
    """Smoke test: run SPRL with a PPO subroutine for one iteration on a randomized environment."""
    pyrado.set_seed(0)

    env = ActNormWrapper(env)

    # Self-paced curriculum over the gravity constant
    env_sprl_params = [
        dict(
            name="gravity_const",
            target_mean=to.tensor([9.81]),
            target_cov_chol_flat=to.tensor([1.0]),
            init_mean=to.tensor([9.81]),
            init_cov_chol_flat=to.tensor([0.05]),
        )
    ]
    randomizer = DomainRandomizer(*[SelfPacedDomainParam(**p) for p in env_sprl_params])
    env = DomainRandWrapperLive(env, randomizer=randomizer)

    # Policy and critic
    policy = FNNPolicy(env.spec, hidden_sizes=[64, 64], hidden_nonlin=to.tanh)
    vfcn = FNNPolicy(
        spec=EnvSpec(env.obs_space, ValueFunctionSpace), hidden_sizes=[32, 32], hidden_nonlin=to.relu
    )
    critic = GAE(
        vfcn,
        gamma=0.9844534412010116,
        lamda=0.9710614403461155,
        num_epoch=10,
        batch_size=150,
        standardize_adv=False,
        lr=0.00016985313083236645,
    )

    # PPO subroutine
    subrtn_hparam = dict(
        max_iter=1,
        eps_clip=0.12648736789309026,
        min_steps=10 * env.max_steps,
        num_epoch=3,
        batch_size=150,
        std_init=0.7573286998997557,
        lr=6.999956625305722e-04,
        max_grad_norm=1.0,
        num_workers=1,
    )
    subrtn = PPO(ex_dir, env, policy, critic, **subrtn_hparam)

    # SPRL meta-algorithm
    algo = SPRL(
        env,
        subrtn,
        kl_constraints_ub=8000,
        performance_lower_bound=500,
        std_lower_bound=0.4,
        kl_threshold=200,
        max_iter=1,
        optimize_mean=optimize_mean,
    )
    algo.train(snapshot_mode="latest")
    assert algo.curr_iter == algo.max_iter
def test_modulated_rew_fcn():
    """Check `RadiallySymmDesStateTask` with a scalar and a per-index modulo factor."""
    Q = np.eye(4)
    R = np.eye(2)
    state = np.array([1, 2, 3, 4])
    act = np.array([0, 0])
    rew_fcn = QuadrErrRewFcn(Q, R)

    # Modulo 2 for all selected states: elements 1 and 3 wrap to 0, leaving errors at indices 0 and 2
    task = RadiallySymmDesStateTask(EnvSpec(None, None, None), np.zeros(4), rew_fcn, [0, 1, 3], 2)
    assert task.step_rew(state, act) == -(1**2 + 3**2)

    # Different modulo factor per selected state
    task = RadiallySymmDesStateTask(EnvSpec(None, None, None), np.zeros(4), rew_fcn, [1, 3], np.array([2, 3]))
    assert task.step_rew(state, act) == -(1**2 + 3**2 + 1**2)
def _create_main_task(self, task_args: dict) -> Task:
    """
    Create the main ball-in-cup task, masking everything but the Cartesian ball position.

    :param task_args: dict of task arguments; keys read here: 'sparse_rew_fcn', 'success_bonus',
                      'Q', 'R', 'final_factor'
    :return: masked task (sparse variant) or a `BestStateFinalRewTask`-wrapped dense variant
    """
    # Create a DesStateTask that masks everything but the ball position
    idcs = list(
        range(self.state_space.flat_dim - 6, self.state_space.flat_dim - 3))  # Cartesian ball position
    spec = EnvSpec(
        self.spec.obs_space, self.spec.act_space,
        self.spec.state_space.subspace(
            self.spec.state_space.create_mask(idcs)))

    # If we do not use copy(), state_des coming from MuJoCo is a reference and updates automatically at each step.
    # Note: sim.forward() + get_body_xpos() results in wrong output for state_des, as sim has not been updated to
    # init_space.sample(), which is first called in reset()

    if task_args.get('sparse_rew_fcn', False):
        factor = task_args.get('success_bonus', 1)

        # Binary final reward task: success is decided solely by check_ball_in_cup
        main_task = FinalRewTask(ConditionOnlyTask(
            spec,
            condition_fcn=self.check_ball_in_cup,
            is_success_condition=True),
                                 mode=FinalRewMode(always_positive=True),
                                 factor=factor)

        # Yield -1 on fail after the main task ist done (successfully or not)
        dont_fail_after_succ_task = FinalRewTask(
            GoallessTask(spec, ZeroPerStepRewFcn()),
            mode=FinalRewMode(always_negative=True),
            factor=factor)

        # Augment the binary task with an endless dummy task, to avoid early stopping
        task = SequentialTasks((main_task, dont_fail_after_succ_task))

        return MaskedTask(self.spec, task, idcs)

    else:
        state_des = self.sim.data.get_site_xpos('cup_goal')  # this is a reference
        # Heavier velocity penalty for the 7-DoF robot variant
        R_default = np.diag([
            0, 0, 1, 1e-2, 1e-2, 1e-1
        ]) if self.num_dof == 7 else np.diag([0, 0, 1e-2, 1e-2])
        rew_fcn = ExpQuadrErrRewFcn(
            Q=task_args.get('Q', np.diag([
                2e1, 1e-4, 2e1
            ])),  # distance ball - cup; shouldn't move in y-direction
            R=task_args.get('R', R_default)  # last joint is really unreliable
        )
        task = DesStateTask(spec, state_des, rew_fcn)

        # Wrap the masked DesStateTask to add a bonus for the best state in the rollout
        return BestStateFinalRewTask(MaskedTask(self.spec, task, idcs),
                                     max_steps=self.max_steps,
                                     factor=task_args.get('final_factor', 0.05))
def test_pddr(ex_dir, env: SimEnv, policy, algo_hparam):
    """Smoke test: train PDDR with PPO teachers, then save, reload, and compare the snapshot."""
    pyrado.set_seed(0)

    # Teacher setup: a copy of the policy trained by PPO with a GAE critic
    teacher_policy = deepcopy(policy)
    vfcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), hidden_sizes=[16, 16], hidden_nonlin=to.tanh)
    teacher_algo_hparam = dict(critic=GAE(vfcn=vfcn), min_steps=1500, max_iter=2)
    teacher_algo = PPO

    # Randomize the environment
    env = DomainRandWrapperLive(env, create_default_randomizer(env))

    # Create the algorithm and train
    algo = PDDR(
        ex_dir,
        env,
        policy,
        max_iter=2,
        min_steps=env.max_steps,
        std_init=0.15,
        num_epochs=10,
        num_teachers=2,
        teacher_policy=teacher_policy,
        teacher_algo=teacher_algo,
        teacher_algo_hparam=teacher_algo_hparam,
        num_workers=1,
    )
    algo.train()
    assert algo.curr_iter == algo.max_iter

    # Round-trip through a snapshot and check that the policy parameters survive
    algo.save_snapshot(meta_info=None)
    algo_loaded = Algorithm.load_snapshot(load_dir=ex_dir)
    assert isinstance(algo_loaded, Algorithm)
    assert all(algo.policy.param_values == algo_loaded.policy.param_values)

    # Load the experiment. Since we did not save any hyper-parameters, we ignore the errors when loading.
    env, policy, extra = load_experiment(ex_dir)
    assert isinstance(env, Env)
    assert isinstance(policy, Policy)
    assert isinstance(extra, dict)
def test_arpl(ex_dir, env: SimEnv):
    """Smoke test: run ARPL with a PPO subroutine on a state-augmented environment."""
    pyrado.set_seed(0)

    env = ActNormWrapper(env)
    env = StateAugmentationWrapper(env, domain_param=None)

    # Policy and value function
    policy = FNNPolicy(env.spec, hidden_sizes=[16, 16], hidden_nonlin=to.tanh)
    vfcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), hidden_sizes=[32, 32], hidden_nonlin=to.tanh)

    # GAE critic
    critic = GAE(
        vfcn,
        gamma=0.9844534412010116,
        lamda=0.9710614403461155,
        num_epoch=10,
        batch_size=150,
        standardize_adv=False,
        lr=0.00016985313083236645,
    )

    # PPO subroutine
    subrtn = PPO(
        ex_dir,
        env,
        policy,
        critic,
        max_iter=2,
        min_steps=23 * env.max_steps,
        min_rollouts=None,
        num_epoch=5,
        eps_clip=0.085,
        batch_size=150,
        std_init=0.995,
        lr=2e-4,
        num_workers=1,
    )

    # ARPL meta-algorithm wrapping the subroutine's exploration strategy
    algo = ARPL(
        ex_dir,
        env,
        subrtn,
        policy,
        subrtn.expl_strat,
        max_iter=2,
        steps_num=23 * env.max_steps,
        halfspan=0.05,
        dyn_eps=0.07,
        dyn_phi=0.25,
        obs_phi=0.1,
        obs_eps=0.05,
        proc_phi=0.1,
        proc_eps=0.03,
        torch_observation=True,
    )
    algo.train(snapshot_mode="best")
def create_task_space_discrepancy_task(env_spec: EnvSpec, rew_fcn: RewFcn) -> MaskedTask:
    """
    Create a task punishing the discrepancy between actual and commanded task-space state (Y and Z only).

    :param env_spec: environment specification
    :param rew_fcn: reward function
    :return: masked task that only considers a subspace of all observations
    """
    # Selection labels; these need to match the observations' names in RcsPySim
    idcs = ['DiscrepTS_Y', 'DiscrepTS_Z']

    # Restrict the state space to the discrepancy observations
    state_mask = env_spec.state_space.create_mask(idcs)
    spec = EnvSpec(env_spec.obs_space, env_spec.act_space, env_spec.state_space.subspace(state_mask))

    # Zero discrepancy is desired; the task never stops because of success
    dst = DesStateTask(spec, np.zeros(spec.state_space.shape), rew_fcn, never_succeeded)

    return MaskedTask(env_spec, dst, idcs)
def __init__(
    self,
    mapping: Dict[int, Tuple[str, str]],
    trafo_mask: Union[list, to.Tensor],
    prior: DomainRandomizer = None,
    scale_params: bool = False,
    use_cuda: bool = False,
):
    """
    Constructor

    :param mapping: mapping from subsequent integers to domain distribution parameters, where the first string of
                    the value tuple is the name of the domain parameter (e.g. mass, length) and the second string is
                    the name of the distribution parameter (e.g. mean, std). The integers are indices of the numpy
                    array which come from the algorithm.
    :param trafo_mask: every domain parameter that is set to `True` in this mask will be learned via a 'virtual'
                      parameter, i.e. in sqrt-space, and then finally squared to retrieve the domain parameter.
                      This transformation is useful to avoid setting a negative variance.
    :param prior: prior believe about the distribution parameters in from of a `DomainRandomizer`
    :param scale_params: if `True`, the sqrt-transformed policy parameters are scaled in the range of
                         $[-0.5, 0.5]$. The advantage of this is to make the parameter-based exploration easier.
    :param use_cuda: `True` to move the policy to the GPU, `False` (default) to use the CPU
    """
    # Validate the inputs before touching any state
    if not isinstance(mapping, dict):
        raise pyrado.TypeErr(given=mapping, expected_type=dict)
    if len(trafo_mask) != len(mapping):
        raise pyrado.ShapeErr(given=trafo_mask, expected_match=mapping)

    self._mapping = mapping
    self.mask = to.tensor(trafo_mask, dtype=to.bool)
    self.prior = prior
    self._scale_params = scale_params

    # Define the parameter space by using the Policy.env_spec.act_space
    param_spec = EnvSpec(
        obs_space=EmptySpace(),
        act_space=BoxSpace(-pyrado.inf, pyrado.inf, shape=len(mapping)),
    )

    # Call Policy's constructor
    super().__init__(param_spec, use_cuda)

    self.params = nn.Parameter(to.empty(param_spec.act_space.flat_dim), requires_grad=True)
    if self._scale_params:
        self.param_scaler = None  # initialized during init_param()
    self.init_param(prior=prior)
def _create_task(self, task_args: dict) -> Task:
    """
    Create the ball-in-cup task, masking everything but the Cartesian ball position.

    :param task_args: dict of task arguments; keys read here: 'sparse_rew_fcn', 'Q', 'R', 'final_factor'
    :return: masked task (sparse variant) or a `BestStateFinalRewTask`-wrapped dense variant
    """
    # Create a DesStateTask that masks everything but the ball position
    idcs = list(
        range(self.state_space.flat_dim - 3,
              self.state_space.flat_dim))  # Cartesian ball position
    spec = EnvSpec(
        self.spec.obs_space, self.spec.act_space,
        self.spec.state_space.subspace(
            self.spec.state_space.create_mask(idcs)))

    # If we do not use copy(), state_des coming from MuJoCo is a reference and updates automatically at each step.
    # Note: sim.forward() + get_body_xpos() results in wrong output for state_des, as sim has not been updated to
    # init_space.sample(), which is first called in reset()

    if task_args.get('sparse_rew_fcn', False):
        # Binary final reward task: success is decided solely by check_ball_in_cup
        task = FinalRewTask(ConditionOnlyTask(
            spec,
            condition_fcn=self.check_ball_in_cup,
            is_success_condition=True),
                            mode=FinalRewMode(always_positive=True),
                            factor=1)

        return MaskedTask(self.spec, task, idcs)

    else:
        # If we do not use copy(), state_des is a reference to passed body and updates automatically at each step
        state_des = self.sim.data.get_site_xpos('cup_goal')  # this is a reference
        rew_fcn = ExpQuadrErrRewFcn(
            Q=task_args.get('Q', np.diag([
                1e1, 1e5, 2e1
            ])),  # distance ball - cup; shouldn't move in y-direction
            R=task_args.get('R', np.diag([
                1e-1, 1e-1, 1e-1, 1e-2, 1e-2, 1e-2
            ]))  # desired joint angles and velocities
        )
        task = DesStateTask(spec, state_des, rew_fcn)

        # Wrap the masked DesStateTask to add a bonus for the best state in the rollout
        return BestStateFinalRewTask(MaskedTask(self.spec, task, idcs),
                                     max_steps=self.max_steps,
                                     factor=task_args.get('final_factor', 1.))
def create_extract_slider_task(env_spec: EnvSpec, task_args: dict, des_state: np.ndarray):
    """
    Create a task for moving the slider to a desired position.

    :param env_spec: environment specification
    :param task_args: dict of task arguments; keys read here: 'Q_slider', 'R_slider'
    :param des_state: desired slider state
    :return: masked task that only considers a subspace of all observations
    """
    # Selection label; this needs to match the observations' names in RcsPySim
    idcs = ['Slider_Y']

    # Restrict the state space to the slider position
    state_mask = env_spec.state_space.create_mask(idcs)
    spec = EnvSpec(env_spec.obs_space, env_spec.act_space, env_spec.state_space.subspace(state_mask))

    # Exponentiated quadratic error reward with overridable weights
    Q = task_args.get('Q_slider', np.array([[4e1]]))
    R = task_args.get('R_slider', 1e-6 * np.eye(spec.act_space.flat_dim))
    dst_task = DesStateTask(spec, des_state, ExpQuadrErrRewFcn(Q, R))

    return MaskedTask(env_spec, dst_task, idcs)
def test_adr(env, ex_dir, subrtn_hparam, actor_hparam, value_fcn_hparam, critic_hparam, adr_hparam):
    """Smoke test: run ADR with a PPO subroutine and SVPG particles for a few iterations."""
    # Build the PPO subroutine for the meta-algorithm
    actor = FNNPolicy(spec=env.spec, **actor_hparam)
    value_fcn = FNNPolicy(spec=EnvSpec(env.obs_space, ValueFunctionSpace), **value_fcn_hparam)
    subroutine = PPO(ex_dir, env, actor, GAE(value_fcn, **critic_hparam), **subrtn_hparam)

    # The SVPG particles reuse the actor/critic hyper-parameters
    particle_hparam = dict(actor=actor_hparam, value_fcn=value_fcn_hparam, critic=critic_hparam)

    algo = ADR(ex_dir, env, subroutine, svpg_particle_hparam=particle_hparam, **adr_hparam)
    algo.train()
    assert algo.curr_iter == algo.max_iter
def create_insert_task(env_spec: EnvSpec, state_des: np.ndarray, rew_fcn: RewFcn, success_fcn: Callable):
    """
    Create a task for inserting the end-effector, tracking its pose and velocity.

    :param env_spec: environment specification
    :param state_des: desired state
    :param rew_fcn: reward function
    :param success_fcn: callable deciding when the desired state counts as reached
    :return: masked task that only considers a subspace of all observations
    """
    # Selection labels; these need to match the observations' names in RcsPySim
    idcs = [
        'Effector_X', 'Effector_Z', 'Effector_B',
        'Effector_Xd', 'Effector_Zd', 'Effector_Bd',
    ]

    # Restrict the state space to the effector pose and velocity
    state_mask = env_spec.state_space.create_mask(idcs)
    spec = EnvSpec(env_spec.obs_space, env_spec.act_space, env_spec.state_space.subspace(state_mask))

    # Desired state task with a state- and time-dependent final reward
    fdst = FinalRewTask(
        DesStateTask(spec, state_des, rew_fcn, success_fcn),
        mode=FinalRewMode(state_dependent=True, time_dependent=True),
    )

    return MaskedTask(env_spec, fdst, idcs)
def create_goal_dist_distvel_task(env_spec: EnvSpec, ds_index: int, rew_fcn: RewFcn,
                                  succ_thold: float = 0.01) -> MaskedTask:
    """
    Create a task rewarding small goal distance and goal distance velocity of one dynamical system (see RcsPySim).

    :param env_spec: environment specification
    :param ds_index: index of the dynamical system (see RcsPySim)
    :param rew_fcn: reward function
    :param succ_thold: proximity threshold below which the task counts as succeeded
    :return: masked task that only considers a subspace of all observations
    """
    # Selection labels; these need to match the observations' names in RcsPySim
    idcs = [f'GD_DS{ds_index}', f'GD_DS{ds_index}d']

    # Restrict the state space, falling back to an empty space if there is none
    state_space = env_spec.state_space
    masked_state_space = (
        state_space.subspace(state_space.create_mask(idcs)) if state_space is not EmptySpace else EmptySpace
    )
    spec = EnvSpec(env_spec.obs_space, env_spec.act_space, masked_state_space)

    # Desired state task with goal [0, 0] (zero distance and zero distance velocity)
    succ_fcn = functools.partial(proximity_succeeded, thold_dist=succ_thold)
    dst = DesStateTask(spec, np.zeros(2), rew_fcn, succ_fcn)

    return MaskedTask(env_spec, dst, idcs)
def test_actor_critic(env, linear_policy, ex_dir, algo, algo_hparam, value_fcn_type, use_cuda):
    """Smoke test: run an actor-critic algorithm for a few iterations with different value function types."""
    # Build the value function requested by the test parametrization
    if value_fcn_type == 'fnn-plain':
        value_fcn = FNN(
            input_size=env.obs_space.flat_dim,
            output_size=1,
            hidden_sizes=[16, 16],
            hidden_nonlin=to.tanh,
            use_cuda=use_cuda,
        )
    else:
        vf_spec = EnvSpec(env.obs_space, ValueFunctionSpace)
        if value_fcn_type == 'fnn':
            value_fcn = FNNPolicy(vf_spec, hidden_sizes=[16, 16], hidden_nonlin=to.tanh, use_cuda=use_cuda)
        else:
            value_fcn = RNNPolicy(vf_spec, hidden_size=16, num_recurrent_layers=1, use_cuda=use_cuda)

    # Critic based on generalized advantage estimation
    critic = GAE(value_fcn, gamma=0.98, lamda=0.95, batch_size=32, lr=1e-3, standardize_adv=False)

    # Merge the shared hyper-parameters with the algorithm-specific ones
    hparam = dict(max_iter=3, min_rollouts=3, num_sampler_envs=1)
    hparam.update(algo_hparam)

    # Create the algorithm and train briefly
    algo = algo(ex_dir, env, linear_policy, critic, **hparam)
    algo.train()
    assert algo.curr_iter == algo.max_iter
def test_rff_regression(ex_dir, num_feat_per_dim: int, loss_fcn: Callable, algo_hparam: dict):
    """Check that nonlinear regression with random Fourier features reduces the loss on synthetic data."""
    # Synthetic noise-free training data
    inputs = to.linspace(-4.0, 4.0, 8001).view(-1, 1)
    targets = noisy_nonlin_fcn(inputs, f=3.0, noise_std=0).view(-1, 1)

    # Linear policy on random Fourier features
    rff = RFFeat(inp_dim=1, num_feat_per_dim=num_feat_per_dim, bandwidth=1 / 20)
    policy = LinearPolicy(EnvSpec(InfBoxSpace(shape=(1, )), InfBoxSpace(shape=(1, ))), FeatureStack(rff))

    # Train and compare the loss before and after
    loss_before = loss_fcn(policy(inputs), targets)
    algo = NonlinRegression(ex_dir, inputs, targets, policy, **algo_hparam)
    algo.train()
    loss_after = loss_fcn(policy(inputs), targets)

    assert loss_after < loss_before
    assert algo.curr_iter >= algo_hparam["max_iter_no_improvement"]
def _create_deviation_task(self, task_args: dict) -> Task: idcs = list( range(self.state_space.flat_dim - 3, self.state_space.flat_dim)) # Cartesian cup goal position spec = EnvSpec( self.spec.obs_space, self.spec.act_space, self.spec.state_space.subspace( self.spec.state_space.create_mask(idcs))) # init cup goal position state_des = np.array([ 0.82521, 0, 1.4469 ]) if self.num_dof == 7 else np.array([0.758, 0, 1.5]) rew_fcn = QuadrErrRewFcn( Q=task_args.get('Q_dev', np.diag( [2e-1, 1e-4, 5e0])), # Cartesian distance from init position R=task_args.get( 'R_dev', np.zeros( (self._act_space.shape[0], self._act_space.shape[0])))) task = DesStateTask(spec, state_des, rew_fcn) return MaskedTask(self.spec, task, idcs)