def create_ik_activation_setup(dt, max_steps, max_dist_force, physics_engine):
    # Set up environment
    env = Planar3LinkIKActivationSim(
        physicsEngine=physics_engine,
        dt=dt,
        max_steps=max_steps,
        max_dist_force=max_dist_force,
        taskCombinationMethod='product',
        positionTasks=True,
        checkJointLimits=False,
        collisionAvoidanceIK=True,
        observeTaskSpaceDiscrepancy=True,
    )
    print_domain_params(env.domain_param)

    # Set up policy
    def policy_fcn(t: float):
        return [0.3 + 0.2 * math.sin(2. * math.pi * 0.2 * t), 0, 1]

    policy = TimePolicy(env.spec, policy_fcn, dt)

    # Simulate
    return rollout(env, policy, render_mode=RenderMode(video=True), stop_on_done=True)
def joint_control_variant(dt, max_steps, max_dist_force, physics_engine):
    # Set up environment
    env = Planar3LinkJointCtrlSim(
        physicsEngine=physics_engine,
        dt=dt,
        max_steps=max_steps,
        max_dist_force=max_dist_force,
        checkJointLimits=True,
    )
    print_domain_params(env.domain_param)

    # Set up policy
    def policy_fcn(t: float):
        return [
            0.1, 0.1,  # same as the initial configuration
            0.1 + 45. / 180. * math.pi * math.sin(2. * math.pi * 0.2 * t),  # oscillation in the last link
        ]

    policy = TimePolicy(env.spec, policy_fcn, dt)

    # Simulate
    return rollout(env, policy, render_mode=RenderMode(video=True), stop_on_done=True)
def create_manual_activation_setup(dt, max_steps, max_dist_force, physics_engine):
    # Set up environment
    env = Planar3LinkTASim(
        physicsEngine=physics_engine,
        dt=dt,
        max_steps=max_steps,
        max_dist_force=max_dist_force,
        positionTasks=True,
        observeTaskSpaceDiscrepancy=True,
    )
    print_domain_params(env.domain_param)

    # Set up policy
    def policy_fcn(t: float):
        # Read three potentials from the command line and squash them to activations in (0, 1) via the logistic function
        pot = np.fromstring(input("Enter potentials for next step: "), dtype=np.double, count=3, sep=' ')
        return 1 / (1 + np.exp(-pot))

    policy = TimePolicy(env.spec, policy_fcn, dt)

    # Simulate
    return rollout(env, policy, render_mode=RenderMode(video=True), stop_on_done=True)
def create_joint_control_setup(dt, max_steps, max_dist_force, physics_engine):
    # Set up environment
    env = Planar3LinkJointCtrlSim(
        physicsEngine=physics_engine,
        dt=dt,
        max_steps=max_steps,
        max_dist_force=max_dist_force,
        taskCombinationMethod="sum",
        checkJointLimits=True,
    )
    print_domain_params(env.domain_param)

    # Set up policy
    def policy_fcn(t: float):
        return [
            10 / 180 * math.pi, 10 / 180 * math.pi,  # same as the initial configuration
            10 / 180 * math.pi + 45.0 / 180.0 * math.pi * math.sin(2.0 * math.pi * 0.2 * t),  # oscillation in the last link
        ]

    policy = TimePolicy(env.spec, policy_fcn, dt)

    # Simulate
    return rollout(env, policy, render_mode=RenderMode(video=True), stop_on_done=True)
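# A minimal usage sketch (not part of the original files) showing how the
# Planar3Link setup variants above could be dispatched from one entry point.
# The flag names, the default physics engine 'Bullet', and the step settings
# are assumptions for illustration only.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', type=str, default='ik_activation',
                        choices=['ik_activation', 'joint_ctrl', 'manual_activation'])
    parser.add_argument('--physics_engine', type=str, default='Bullet')  # assumed engine name
    args = parser.parse_args()

    setup_fcns = {
        'ik_activation': create_ik_activation_setup,
        'joint_ctrl': create_joint_control_setup,
        'manual_activation': create_manual_activation_setup,
    }
    ro = setup_fcns[args.mode](dt=0.01, max_steps=1800, max_dist_force=None,
                               physics_engine=args.physics_engine)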
def _compute_candidate(self, nc: int):
    """
    Train and save one candidate solution to a pt-file.

    :param nc: number of domains used for training the candidate solution
    """
    # Do a warm start if desired
    self._subrtn_cand.init_modules(
        self.warmstart_cand,
        prefix=f"iter_{self._curr_iter - 1}",
        suffix="cand",
        policy_param_init=self.cand_policy_param_init,
        valuefcn_param_init=self.cand_critic_param_init,
    )

    # Sample sets of physics params xi_{1}, ..., xi_{nc}
    self.env_dr.fill_buffer(nc)
    env_params_cand = self.env_dr.randomizer.get_params()
    joblib.dump(env_params_cand, osp.join(self.save_dir, f"iter_{self._curr_iter}_env_params_cand.pkl"))
    print("Randomized parameters for the candidate solution:")
    print_domain_params(env_params_cand)

    # Reset the subroutine algorithm, which includes resetting the exploration
    self._cnt_samples += self._subrtn_cand.sample_count
    self._subrtn_cand.reset()
    print("Reset candidate exploration noise.")

    pol_param_before = self._subrtn_cand.policy.param_values.clone()
    if isinstance(self._subrtn_cand, ActorCritic):
        # Set dropout and batch normalization layers to training mode
        self._subrtn_cand.critic.vfcn.train()
        critic_param_before = self._subrtn_cand.critic.vfcn.param_values.clone()

    # Solve the (approx.) stochastic program SP_nc for the sampled physics parameter sets
    print_cbt(f"\nIteration {self._curr_iter} | Candidate solution\n", "c", bright=True)
    self._subrtn_cand.train(snapshot_mode="best",
                            meta_info=dict(prefix=f"iter_{self._curr_iter}", suffix="cand"))

    if (self._subrtn_cand.policy.param_values == pol_param_before).all():
        warn("The candidate's policy parameters did not change during training!", UserWarning)
    # Guard on the candidate's type (the original checked self._subrtn_refs, but the candidate's critic is inspected)
    if isinstance(self._subrtn_cand, ActorCritic):
        if (self._subrtn_cand.critic.vfcn.param_values == critic_param_before).all():
            warn("The candidate's critic parameters did not change during training!", UserWarning)

    print_cbt("Learned an approx solution for SP_nc.\n", "y")
def create_setup(physics_engine, dt, max_steps, max_dist_force):
    # Set up environment
    env = BallOnPlate5DSim(
        physicsEngine=physics_engine,
        dt=dt,
        max_steps=max_steps,
        max_dist_force=max_dist_force,
    )
    env = ActNormWrapper(env)
    print_domain_params(env.domain_param)

    # Set up policy
    def policy_fcn(t: float):
        return [
            0.0,  # x_ddot_plate
            0.5 * math.sin(2. * math.pi * 5 * t),  # y_ddot_plate
            5. * math.cos(2. * math.pi / 5. * t),  # z_ddot_plate
            0.0,  # alpha_ddot_plate
            0.0,  # beta_ddot_plate
        ]

    policy = TimePolicy(env.spec, policy_fcn, dt)

    return env, policy
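# A usage sketch for create_setup() above: roll out the time-based policy in
# the ball-on-plate environment and print the return. The physics engine name
# 'Bullet' and the step settings are assumed example values.
if __name__ == '__main__':
    env, policy = create_setup(physics_engine='Bullet', dt=0.01, max_steps=1000, max_dist_force=None)
    ro = rollout(env, policy, render_mode=RenderMode(video=True), stop_on_done=True)
    print_cbt(f'Return: {ro.undiscounted_return()}', 'g', bright=True)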
def ik_control_variant(dt, max_steps, max_dist_force, physics_engine):
    # Set up environment
    env = Planar3LinkIKSim(
        physicsEngine=physics_engine,
        dt=dt,
        max_steps=max_steps,
        max_dist_force=max_dist_force,
        checkJointLimits=True,
    )
    print_domain_params(env.domain_param)

    # Set up policy
    def policy_fcn(t: float):
        return [0.3 + 0.2 * math.sin(2. * math.pi * 0.2 * t), 1.1]

    policy = TimePolicy(env.spec, policy_fcn, dt)

    # Simulate
    return rollout(env, policy, render_mode=RenderMode(video=True), stop_on_done=True)
def __init__(self):
    ShowBase.__init__(self)

    self.done = False
    self.state = None
    self.param = None

    # Simulate one rollout and query the user for the next one
    self.ro = rollout(
        env,
        policy,
        render_mode=RenderMode(text=args.verbose, video=args.animation),
        eval=True,
        max_steps=max_steps,
        stop_on_done=not args.relentless,
        reset_kwargs=dict(domain_param=self.param, init_state=self.state),
    )
    print_domain_params(env.domain_param)
    print_cbt(f"Return: {self.ro.undiscounted_return()}", "g", bright=True)
    self.done, self.state, self.param = after_rollout_query(env, policy, self.ro)

    # Extract the geometry information from the simulation
    self.bob = BallOnBeamSim(2)
    self.pos, self.r_ball, self.a, self.l_beam, self.d_beam = self.bob._init_anim()

    # Load the models and place them in the scene
    self.ball = self.loader.loadModel("my_models/ball")
    self.ball.reparentTo(self.render)
    self.ball.setPos(self.pos)
    self.box = self.loader.loadModel("my_models/box")
    self.box.reparentTo(self.render)
    self.box.setPos(0, 0, 0)
    self.box.setScale(self.l_beam, self.d_beam, 2 * self.d_beam)
    self.camera.setPos(0, -10, 0)
def sim_policy_fixed_env(env: SimEnv, policy: Policy, domain_param: [dict, list]):
    """
    Simulate (with animation) a rollout in an environment with fixed domain parameters.

    :param env: environment stack as it was used during training
    :param policy: policy to simulate
    :param domain_param: domain parameter set or a list of sets that specify the environment
    """
    # Remove wrappers that make the rollouts stochastic
    env = remove_env(env, GaussianObsNoiseWrapper)
    env = remove_env(env, DomainRandWrapperBuffer)
    env = remove_env(env, DomainRandWrapperLive)

    # Initialize
    done, state, i = False, None, 0
    if isinstance(domain_param, dict):
        param = domain_param
    elif isinstance(domain_param, list):
        param = domain_param[i]
    else:
        raise pyrado.TypeErr(given=domain_param, expected_type=[dict, list])

    while not done:
        ro = rollout(env, policy, reset_kwargs=dict(domain_param=param, init_state=state),
                     render_mode=RenderMode(video=True), eval=True)
        print_domain_params(env.domain_param)
        print_cbt(f'Return: {ro.undiscounted_return()}', 'g', bright=True)
        done, state, _ = after_rollout_query(env, policy, ro)

        if isinstance(domain_param, list):
            # Iterate over the list of domain parameter sets
            i = (i + 1) % len(domain_param)
            param = domain_param[i]
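# A usage sketch for sim_policy_fixed_env(): replay a trained policy under one
# nominal and one perturbed parameter set. The loading helper and the parameter
# name 'ball_mass' are hypothetical and only illustrate the call signature.
if __name__ == '__main__':
    env, policy = load_trained_experiment()  # hypothetical loading helper
    dp_sets = [dict(ball_mass=0.3), dict(ball_mass=0.6)]  # hypothetical parameter name and values
    sim_policy_fixed_env(env, policy, domain_param=dp_sets)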
# Plot the found domain distribution parameters
for i in range(len(found_cands)):
    cand = to.load(osp.join(ex_dir, found_cands[i])).numpy()
    ax.scatter(np.arange(cand.size), cand, label=r'$\phi_{' + str(i) + r'}$', c=f'C{i % 10}', s=16)
ax.xaxis.set_major_locator(MaxNLocator(integer=True))
ax.set_ylabel('parameter value')
ax.set_xlabel('parameter index')
plt.legend()
plt.show()

# Simulate
for i in range(len(found_policies)):
    # Load the current policy and the associated domain distribution parameters
    policy = to.load(osp.join(ex_dir, found_policies[i]))
    cand = to.load(osp.join(ex_dir, found_cands[i]))

    # Set the domain randomizer given the hyper-parameters
    if isinstance(env_sim, MetaDomainRandWrapper):
        env_sim.adapt_randomizer(cand)
        print_cbt(f'Set the domain randomizer to\n{env_sim.randomizer}', 'c')
    else:
        raise pyrado.TypeErr(given=env_sim, expected_type=MetaDomainRandWrapper)

    done, state, param = False, None, None
    while not done:
        print_cbt(f'Simulating {found_policies[i]} with the associated domain parameter distribution.', 'g')
        ro = rollout(env_sim, policy, render_mode=RenderMode(video=True), eval=True,
                     reset_kwargs=dict(domain_param=param, init_state=state))  # calls env.reset()
        print_domain_params(env_sim.domain_param)
        print_cbt(f'Return: {ro.undiscounted_return()}', 'g', bright=True)
        done, state, param = after_rollout_query(env_sim, policy, ro)

pyrado.close_vpython()
""" Test predefined energy-based controller to make the Quanser Qube swing up. """ import torch as to from pyrado.environments.pysim.quanser_qube import QQubeSim from pyrado.domain_randomization.utils import print_domain_params from pyrado.policies.environment_specific import QQubeSwingUpAndBalanceCtrl from pyrado.sampling.rollout import rollout, after_rollout_query from pyrado.utils.data_types import RenderMode from pyrado.utils.input_output import print_cbt if __name__ == '__main__': # Set up environment env = QQubeSim(dt=1/500., max_steps=4000) # Set up policy policy = QQubeSwingUpAndBalanceCtrl(env.spec) # Simulate done, param, state = False, None, None while not done: ro = rollout(env, policy, render_mode=RenderMode(text=False, video=True), eval=True, reset_kwargs=dict(domain_param=param, init_state=state)) print_domain_params(env.domain_param) print_cbt(f'Return: {ro.undiscounted_return()}', 'g', bright=True) done, state, param = after_rollout_query(env, policy, ro)
def _compute_references(self, nr: int, nG: int):
    """
    Train and save nG reference solutions to pt-files.

    :param nr: number of domains used for training the reference solutions
    :param nG: number of reference solutions
    """
    # Loop to compute a distribution of optimality gaps via nG samples
    for k in range(nG):
        print_cbt(f'Iteration {self._curr_iter} | Reference solution {k + 1} of {nG}\n', 'c', bright=True)

        if not self.warmstart_refs:
            # Create a new reference policy by re-initializing its parameters
            self._subrtn_cand.policy.init_param()
            # Create a new value function by re-initializing its parameters
            if isinstance(self._subrtn_refs, ActorCritic):
                self._subrtn_refs.critic.value_fcn.init_param()
            print_cbt('Created a new reference solution.\n', 'y')

        else:
            # Continue from the candidate's policy of the current iteration
            self._subrtn_refs.policy.load_state_dict(
                to.load(osp.join(self._save_dir, f'iter_{self._curr_iter}_policy_cand.pt')).state_dict()
            )
            if not (self._subrtn_refs.policy.param_values == self._subrtn_cand.policy.param_values).all():
                warn("The reference policy's parameters are not equal to the candidate's after loading them! "
                     "This can be explained by snapshot_mode='best'.", UserWarning)
            # Continue from the candidate's value function of the current iteration
            if isinstance(self._subrtn_cand, ActorCritic) and isinstance(self._subrtn_refs, ActorCritic):
                self._subrtn_refs.critic.value_fcn.load_state_dict(
                    to.load(osp.join(self._save_dir, f'iter_{self._curr_iter}_valuefcn_cand.pt')).state_dict()
                )
            print_cbt('Initialized the reference solution with the previously trained candidate solution.\n', 'y')

        # Sample new sets of physics params xi_{k,1}, ..., xi_{k,nr}
        self._env_dr.fill_buffer(nr)
        env_params_ref = self._env_dr.randomizer.get_params()
        joblib.dump(env_params_ref, osp.join(self._save_dir, f'iter_{self._curr_iter}_env_params_ref_{k}.pkl'))
        print('Randomized parameters for the current reference solution:')
        print_domain_params(env_params_ref)

        # Reset the subroutine algorithm, which includes resetting the exploration
        self._subrtn_refs.reset()
        print_cbt('Reset reference exploration noise.', 'y')

        if isinstance(self._subrtn_refs, ActorCritic):
            # Set dropout and batch normalization layers to training mode
            self._subrtn_refs.critic.value_fcn.train()
            critic_param_before = self._subrtn_refs.critic.value_fcn.param_values.clone()

        # Solve the (approx.) stochastic program SP_n for the sampled physics parameter sets
        pol_param_before = self._subrtn_refs.policy.param_values.clone()
        self._subrtn_refs.train(snapshot_mode='best',
                                meta_info=dict(prefix=f'iter_{self._curr_iter}', suffix=f'ref_{k}'))

        if (self._subrtn_refs.policy.param_values == pol_param_before).all():
            warn("The reference's policy parameters did not change during training!", UserWarning)
        if isinstance(self._subrtn_refs, ActorCritic):
            if (self._subrtn_refs.critic.value_fcn.param_values == critic_param_before).all():
                warn("The reference's critic parameters did not change during training!", UserWarning)

        print_cbt('Learned an approx solution for SP_n.\n', 'y')
def _compute_candidate(self, nc: int):
    """
    Train and save one candidate solution to a pt-file.

    :param nc: number of domains used for training the candidate solution
    """
    if self._curr_iter == 0 or not self.warmstart_cand:
        # Create a new candidate policy by re-initializing its parameters
        self._subrtn_cand.policy.init_param(self.cand_policy_param_init)
        # Create a new value function by re-initializing its parameters
        if isinstance(self._subrtn_cand, ActorCritic):
            self._subrtn_cand.critic.value_fcn.init_param(self.cand_critic_param_init)
        print_cbt('Created a new candidate solution.\n', 'y')

    elif self._curr_iter > 0 and self.warmstart_cand:
        # Continue from the candidate's policy of the previous iteration
        self._subrtn_cand.policy.load_state_dict(
            to.load(osp.join(self._save_dir, f'iter_{self._curr_iter - 1}_policy_cand.pt')).state_dict()
        )
        # Continue from the candidate's value function of the previous iteration
        if isinstance(self._subrtn_cand, ActorCritic):
            self._subrtn_cand.critic.value_fcn.load_state_dict(
                to.load(osp.join(self._save_dir, f'iter_{self._curr_iter - 1}_valuefcn_cand.pt')).state_dict()
            )
        print_cbt('Initialized the candidate solution with the previously trained candidate.\n', 'y')

    else:
        raise pyrado.ValueErr(msg='Faulty joint configuration of curr_iter and warmstart_cand!')

    # Sample sets of physics params xi_{1}, ..., xi_{nc}
    self._env_dr.fill_buffer(nc)
    env_params_cand = self._env_dr.randomizer.get_params()
    joblib.dump(env_params_cand, osp.join(self._save_dir, f'iter_{self._curr_iter}_env_params_cand.pkl'))
    print('Randomized parameters for the candidate solution:')
    print_domain_params(env_params_cand)

    # Reset the subroutine algorithm, which includes resetting the exploration
    self._subrtn_cand.reset()
    print('Reset candidate exploration noise.')

    if isinstance(self._subrtn_cand, ActorCritic):
        # Set dropout and batch normalization layers to training mode
        self._subrtn_cand.critic.value_fcn.train()
        critic_param_before = self._subrtn_cand.critic.value_fcn.param_values.clone()

    # Solve the (approx.) stochastic program SP_nc for the sampled physics parameter sets
    print_cbt(f'\nIteration {self._curr_iter} | Candidate solution\n', 'c', bright=True)
    pol_param_before = self._subrtn_cand.policy.param_values.clone()
    self._subrtn_cand.train(snapshot_mode='best',
                            meta_info=dict(prefix=f'iter_{self._curr_iter}', suffix='cand'))

    if (self._subrtn_cand.policy.param_values == pol_param_before).all():
        warn("The candidate's policy parameters did not change during training!", UserWarning)
    # Guard on the candidate's type (the original checked self._subrtn_refs, but the candidate's critic is inspected)
    if isinstance(self._subrtn_cand, ActorCritic):
        if (self._subrtn_cand.critic.value_fcn.param_values == critic_param_before).all():
            warn("The candidate's critic parameters did not change during training!", UserWarning)

    print_cbt('Learned an approx solution for SP_nc.\n', 'y')
def _compute_references(self, nr: int, nG: int):
    """
    Train and save nG reference solutions to pt-files.

    :param nr: number of domains used for training the reference solutions
    :param nG: number of reference solutions
    """
    # Loop to compute a distribution of optimality gaps via nG samples
    for k in range(nG):
        print_cbt(f'Iteration {self._curr_iter} | Reference solution {k + 1} of {nG}\n', 'c', bright=True)

        # Do a warm start if desired
        self._subrtn_refs.init_modules(
            self.warmstart_refs,
            prefix=f'iter_{self._curr_iter}',
            suffix='cand',
            policy_param_init=self.cand_policy_param_init,
            valuefcn_param_init=self.cand_critic_param_init,
        )

        # Sample new sets of physics params xi_{k,1}, ..., xi_{k,nr}
        self.env_dr.fill_buffer(nr)
        env_params_ref = self.env_dr.randomizer.get_params()
        joblib.dump(env_params_ref, osp.join(self.save_dir, f'iter_{self._curr_iter}_env_params_ref_{k}.pkl'))
        print('Randomized parameters for the current reference solution:')
        print_domain_params(env_params_ref)

        # Reset the subroutine algorithm, which includes resetting the exploration
        self._cnt_samples += self._subrtn_refs.sample_count
        self._subrtn_refs.reset()
        print_cbt('Reset reference exploration noise.', 'y')

        pol_param_before = self._subrtn_refs.policy.param_values.clone()
        if isinstance(self._subrtn_refs, ActorCritic):
            # Set dropout and batch normalization layers to training mode
            self._subrtn_refs.critic.vfcn.train()
            critic_param_before = self._subrtn_refs.critic.vfcn.param_values.clone()

        # Solve the (approx.) stochastic program SP_n for the sampled physics parameter sets
        self._subrtn_refs.train(snapshot_mode='best',
                                meta_info=dict(prefix=f'iter_{self._curr_iter}', suffix=f'ref_{k}'))

        if (self._subrtn_refs.policy.param_values == pol_param_before).all():
            warn("The reference's policy parameters did not change during training!", UserWarning)
        if isinstance(self._subrtn_refs, ActorCritic):
            if (self._subrtn_refs.critic.vfcn.param_values == critic_param_before).all():
                warn("The reference's critic parameters did not change during training!", UserWarning)

        print_cbt('Learned an approx solution for SP_n.\n', 'y')
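# The clone-train-compare pattern repeats across the candidate and reference
# routines above. A sketch of a helper that factors out the warning; the helper
# name and its use are suggestions, not part of the original class.
def _warn_if_unchanged(params_before: to.Tensor, params_after: to.Tensor, label: str):
    """Warn if a module's parameters were not updated by the subroutine's training run."""
    if (params_after == params_before).all():
        warn(f"The {label} parameters did not change during training!", UserWarning)

# Example use:
#     pol_param_before = subrtn.policy.param_values.clone()
#     subrtn.train(...)
#     _warn_if_unchanged(pol_param_before, subrtn.policy.param_values, "reference's policy")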