Example #1
    def create_init_conditions(self):
        change_print_color.change('MAGENTA')
        print("\nCreating Initial Conditions...")
        initial_cond = self.task_params['init_cond']

        ddof = 3  # Data dof (file): x, y, theta
        pdof = 3  # Pose dof (env): x, y, theta
        ntgt = self.task_params['ntargets']

        for cc, cond in enumerate(initial_cond):
            condition = np.zeros(self.env.obs_dim)
            condition[:self.env.action_dim] = np.deg2rad(cond[:3])
            cond_idx = 2*self.env.action_dim + pdof  # EE pose will be obtained from sim
            data_idx = self.env.action_dim
            for tt in range(ntgt):
                tgt_data = cond[data_idx:data_idx+ddof]
                # tgt_pose = create_quat_pose(pos_x=tgt_data[0],
                #                             pos_y=tgt_data[1],
                #                             pos_z=z_fix,
                #                             rot_yaw=np.deg2rad(tgt_data[2]))
                # condition[cond_idx:cond_idx+pdof] = tgt_pose
                tgt_data[2] = np.deg2rad(tgt_data[2])
                condition[cond_idx:cond_idx+pdof] = tgt_data
                cond_idx += pdof
                data_idx += ddof

            self.env.add_init_cond(condition)

        return self.env.get_conditions()
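A minimal, self-contained sketch of the packing layout used above (joint angles in radians first, a gap for joint velocities and the simulated end-effector pose, then x, y, theta per target). The dimensions below are illustrative placeholders, not values read from the environment:

import numpy as np

def pack_condition(joint_angles_deg, targets_deg, obs_dim, action_dim):
    """Pack one flat initial-condition vector: joint angles (rad), then
    (x, y, theta) per target with theta converted to radians."""
    pdof = 3  # pose dof per target: x, y, theta
    condition = np.zeros(obs_dim)
    condition[:action_dim] = np.deg2rad(joint_angles_deg)
    cond_idx = 2 * action_dim + pdof  # skip joint velocities and the EE pose slot
    for x, y, theta_deg in targets_deg:
        condition[cond_idx:cond_idx + pdof] = [x, y, np.deg2rad(theta_deg)]
        cond_idx += pdof
    return condition

# Hypothetical usage: 3-DoF arm (positions + velocities), EE pose, two targets -> obs_dim = 15
cond = pack_condition([10.0, -20.0, 30.0],
                      [(0.5, 0.1, 45.0), (0.3, -0.2, 90.0)],
                      obs_dim=15, action_dim=3)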
Example #2
    def create_environment(self):
        """Instantiate an specific RL environment to interact with.

        Returns:
            RL environment

        """
        change_print_color.change('BLUE')
        print("\nCreating Environment...")

        # Environment parameters
        env_with_img = False
        rdn_tgt_pos = False
        render = self.task_params['render']
        obs_like_mjc = self.task_params['obs_like_mjc']
        ntargets = self.task_params['ntargets']
        tgt_weights = self.task_params['tgt_weights']
        tgt_positions = self.task_params['tgt_positions']
        tgt_types = self.task_params['tgt_types']
        env = Pusher3DofBulletEnv(render=render, obs_with_img=env_with_img,
                                  obs_mjc_gym=obs_like_mjc, ntargets=ntargets,
                                  rdn_tgt_pos=rdn_tgt_pos, tgt_types=tgt_types)

        env.set_tgt_cost_weights(tgt_weights)
        env.set_tgt_pos(tgt_positions)

        print("Environment:%s OK!." % type(env).__name__)

        return env
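For reference, a hypothetical task_params dictionary containing only the keys this factory reads; the concrete values are placeholders and are not taken from any experiment script:

task_params = {
    'render': False,          # run the Bullet simulation headless
    'obs_like_mjc': False,    # keep the environment's own observation layout
    'ntargets': 2,            # number of target objects
    'tgt_weights': [1.0, 1.0],
    'tgt_positions': [(0.6, 0.1, 0.0), (0.3, -0.2, 0.0)],
    'tgt_types': ['C', 'C'],  # target geometry codes (placeholder values)
}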
Example #3
    def test_policy(self, pol_type=None, condition=0, iteration=-1):
        noise = np.zeros((self.task_params['T'], self.agent.act_dim))

        if iteration == -1:
            for rr in range(600):
                temp_path = self.hyperparams['log_dir'] + ('/itr_%02d' % rr)
                if os.path.exists(temp_path):
                    iteration += 1

        if iteration == -1:
            print("There is not itr_XX data in '%s'" %
                  self.hyperparams['log_dir'])
            return False

        dir_path = 'itr_%02d/' % iteration

        itr_data_file = dir_path + 'iteration_data_itr_%02d.pkl' % iteration

        change_print_color.change('BLUE')
        print("\nLoading iteration data '%s'..." % itr_data_file)

        itr_data = self.learn_algo.data_logger.unpickle(itr_data_file)
        policy = itr_data[condition].traj_distr

        self.env.reset(condition=condition)
        input('Press a key to start sampling...')
        sample = self.agent.sample(self.env,
                                   condition,
                                   self.task_params['T'],
                                   self.task_params['Ts'],
                                   noise,
                                   policy=policy,
                                   save=False)
        return True
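The iteration lookup above counts how many itr_XX folders exist, which only yields the last iteration while the numbering has no gaps. A standalone sketch that instead tracks the highest existing index, assuming the same itr_%02d naming convention:

import os

def find_latest_iteration(log_dir, max_itr=600):
    """Return the largest N for which '<log_dir>/itr_NN' exists, or -1."""
    latest = -1
    for rr in range(max_itr):
        if os.path.exists(os.path.join(log_dir, 'itr_%02d' % rr)):
            latest = rr
    return latest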
Example #4
    def create_agent(self):
        change_print_color.change('CYAN')
        print("\nCreating Agent...")

        agent = NoPolAgent(act_dim=self.action_dim, obs_dim=self.obs_dim,
                           state_dim=self.state_dim,
                           agent_name='agent'+str('%02d' %
                                                  self.hyperparams['run_num']))
        print("Agent:%s OK\n" % type(agent).__name__)

        return agent
Example #5
    def create_agent(self):
        """Instantiate the RL agent who interacts with the environment.

        Returns:
            RL agent

        """
        change_print_color.change('CYAN')
        print("\nCreating Agent...")

        policy_params = {
            'network_model': tf_network,  # tf_network, multi_modal_network, multi_modal_network_fp
            'network_params': {
                'n_layers': 2,  # Number of Hidden layers
                'dim_hidden': [32, 32],  # List of size per n_layers
                'obs_names': self.env.get_obs_info()['names'],
                'obs_dof': self.env.get_obs_info()['dimensions'],  # DoF for observation data tensor
            },
            # Initialization.
            'init_var': 0.1,  # Initial policy variance.
            'ent_reg': 0.0,  # Entropy regularizer (Used to update policy variance)
            # Solver hyperparameters.
            'iterations': self.task_params['tf_iterations'],  # Number of iterations per inner iteration (Default:5000).
            'batch_size': 15,
            'lr': 0.001,  # Base learning rate (by default it's fixed).
            'lr_policy': 'fixed',  # Learning rate policy.
            'momentum': 0.9,  # Momentum.
            'weight_decay': 0.005,  # Weight decay to prevent overfitting.
            'solver_type': 'Adam',  # Solver type (e.g. 'SGD', 'Adam', 'RMSPROP', 'MOMENTUM', 'ADAGRAD').
            # GPU usage.
            'use_gpu': self.task_params['use_gpu'],  # Whether or not to use the GPU for training.
            'gpu_id': 0,
            'gpu_mem_percentage': self.task_params['gpu_mem_percentage'],
            # Training data.
            'fc_only_iterations': 0,  # Iterations of only FC before normal training
            # Others.
            'random_seed': self.hyperparams['seed'] \
                if self.task_params['tf_seed'] == -1 \
                else self.task_params['tf_seed'],  # TF random seed
            'log_dir': self.hyperparams['log_dir'],
            # 'weights_file_prefix': EXP_DIR + 'policy',
        }

        policy_opt = {'type': PolicyOptTf, 'hyperparams': policy_params}

        agent = GPSAgent(act_dim=self.action_dim,
                         obs_dim=self.obs_dim,
                         state_dim=self.state_dim,
                         policy_opt=policy_opt,
                         agent_name='agent' +
                         str('%02d' % self.hyperparams['run_num']))
        print("Agent:%s OK\n" % type(agent).__name__)

        return agent
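The 'n_layers'/'dim_hidden' entries describe the hidden part of a small fully connected policy network. A plain-Python sketch of the resulting layer shapes (the dimensions are hypothetical and the tf_network helper itself is not used here):

def mlp_weight_shapes(obs_dim, act_dim, dim_hidden):
    """Return the (in, out) shape of every weight matrix in an MLP whose
    hidden-layer widths are listed in dim_hidden."""
    sizes = [obs_dim] + list(dim_hidden) + [act_dim]
    return list(zip(sizes[:-1], sizes[1:]))

# e.g. mlp_weight_shapes(20, 7, [32, 32]) -> [(20, 32), (32, 32), (32, 7)]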
Example #6
    def train(self, itr_load=None):
        """Train the RL agent with the learning algorithm.

        Args:
            itr_load: Iteration number with which to start

        Returns:
            bool: True for success, False otherwise.

        """
        change_print_color.change('WHITE')
        return self.learn_algo.run(itr_load)
Example #7
    def test_policy(self, pol_type=None, condition=0, iteration=-1):
        """Test the RL agent using the policy learned in the specificied
        iteration in the specific condition.

        Args:
            pol_type: 'global' or 'local'
            condition: Condition number to test the agent
            iteration: Iteration to test the agent

        Returns:
            bool: True for success, False otherwise.

        """
        noise = np.zeros((self.task_params['T'], self.agent.act_dim))

        if iteration == -1:
            for rr in range(600):
                temp_path = self.hyperparams['log_dir'] + ('/itr_%02d' % rr)
                if os.path.exists(temp_path):
                    iteration += 1

        if iteration == -1:
            print("There is not itr_XX data in '%s'" %
                  self.hyperparams['log_dir'])
            return False

        dir_path = 'itr_%02d/' % iteration

        itr_data_file = dir_path + 'iteration_data_itr_%02d.pkl' % iteration

        change_print_color.change('BLUE')
        print("\nLoading iteration data '%s'..." % itr_data_file)

        itr_data = self.learn_algo.data_logger.unpickle(itr_data_file)
        policy = itr_data[condition].traj_distr

        stop = False
        while stop is False:
            self.env.reset(condition=condition)
            input('Press a key to start sampling...')
            sample = self.agent.sample(self.env,
                                       condition,
                                       self.task_params['T'],
                                       self.task_params['Ts'],
                                       noise,
                                       policy=policy,
                                       save=False)
            answer = input('Execute again? Enter n/N to stop: ')
            if answer.lower() == 'n':
                stop = True

        return True
Example #8
    def create_agent(self):
        """Instantiate the RL agent who interacts with the environment.

        Returns:
            RL agent

        """
        change_print_color.change('CYAN')
        print("\nCreating Agent...")

        agent = NoPolAgent(act_dim=self.action_dim, obs_dim=self.obs_dim,
                           state_dim=self.state_dim,
                           agent_name='agent'+str('%02d' %
                                                  self.hyperparams['run_num']))
        print("Agent:%s OK\n" % type(agent).__name__)

        return agent
Example #9
    def create_environment(self):
        """Instantiate an specific RL environment to interact with.

        Returns:
            RL environment

        """
        change_print_color.change('BLUE')
        print("\nCreating Environment...")

        dt = self.task_params['Ts']

        # Create a CENTAURO ROS EnvInterface
        env = CentauroDrillEnv(dt=dt)

        print("Environment:%s OK!." % type(env).__name__)

        return env
Example #10
RH_name = 'RWrMot3'
l_soft_hand_offset = np.array([0.000, -0.030, -0.210])
r_soft_hand_offset = np.array([0.000, 0.030, -0.210])

touching_drill_config = np.array([
    0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.0568, 0.2386,
    -0.2337, -1.6803, 0.2226, 0.0107, 0.5633, 0., 0., 0.0568, -0.2386, 0.2337,
    -1.6803, -0.2226, 0.0107, -0.5633
])

# ################### #
# ################### #
# ### ENVIRONMENT ### #
# ################### #
# ################### #
change_print_color.change('BLUE')
print("\nCreating Bigman environment...")

# Robot configuration
interface = 'ros'
body_part_active = 'RA'
body_part_sensed = 'RA'
command_type = 'effort'

if body_part_active == 'RA':
    hand_y = -drill_size[1] / 2 - 0.02
    hand_z = drill_size[2] / 2 + 0.02
    hand_name = RH_name
    hand_offset = r_soft_hand_offset
else:
    hand_y = drill_size[1] / 2 + 0.02
Example #11
    def create_learning_algo(self):
        change_print_color.change('YELLOW')
        print("\nConfiguring learning algorithm...\n")

        # Learning params
        resume_training_itr = None  # Resume from previous training iteration

        # Dynamics
        learned_dynamics = {'type': DynamicsLRPrior,
                            'regularization': 1e-6,
                            'prior': {
                                'type': DynamicsPriorGMM,
                                'max_clusters': 20,  # Maximum number of clusters to fit.
                                'min_samples_per_cluster': 40,  # Minimum samples per cluster.
                                'max_samples': 20,  # Max. number of trajectories to use for fitting the GMM at any given time.
                                'strength': 1.0,  # Adjusts the strength of the prior.
                            },
                            }

        init_traj_distr = {'type': init_pd,
                           'init_var': np.array([1.0, 1.0, 1.0])*5.0e-01,
                           'pos_gains': 0.001,  # float or array
                           'vel_gains_mult': 0.01,  # Velocity gains multiplier on pos_gains
                           'init_action_offset': None,
                           'dJoints': self.env.get_total_joints(),  # Total joints in state
                           'state_to_pd': 'joints',  # Joints
                           'dDistance': 6,
                           }

        # Trajectory Optimization Method
        traj_opt_method = {
            'type': DualistTrajOpt,
            'good_const': self.task_params['consider_good'],  # Use good constraints
            'bad_const': self.task_params['consider_bad'],  # Use bad constraints
            'del0': 1e-4,  # Eta updates for non-SPD Q-function (non-SPD correction step).
            'del0_good': 1e-4,  # Omega updates for non-SPD Q-function (non-SPD correction step).
            'del0_bad': 1e-8,  # Nu updates for non-SPD Q-function (non-SPD correction step).
            # 'eta_error_threshold': 1e16, # TODO: REMOVE, it is not used
            'min_eta': 1e-8,  # At min_eta, kl_div > kl_step
            'max_eta': 1e16,  # At max_eta, kl_div < kl_step
            'min_omega': 1e-8,  # At min_omega, kl_div > kl_step
            'max_omega': self.task_params['max_omega'],  #1e16,  # At max_omega, kl_div < kl_step
            'min_nu': 1e-8,  # At min_nu, kl_div > kl_step
            'max_nu': self.task_params['max_nu'],  # At max_nu, kl_div < kl_step,
            'step_tol': 0.1,
            'bad_tol': 0.1,
            'good_tol': 0.1,
            'cons_per_step': False,  # Whether or not to enforce separate KL constraints at each time step. #TODO: IF TRUE, MAYBE IT DOES WORK WITH MDGPS because it doesn't consider dual vars
            'use_prev_distr': False,  # Whether or not to measure expected KL under the previous traj distr.
            'update_in_bwd_pass': True,  # Whether or not to update the TVLG controller during the bwd pass.
            'adam_alpha': 0.5,
            'adam_max_iter': 500,
            'weight_bad': self.task_params['weight_bad'],
            'weight_good': self.task_params['weight_good'],
            }

        good_trajs = None
        bad_trajs = None
        dualtrajopt_hyperparams = {
            'inner_iterations': self.task_params['inner_iterations'],  # Times the trajectories are updated
            # G/B samples selection | Fitting
            'good_samples': good_trajs,  # Good samples demos
            'bad_samples': bad_trajs,  # Bad samples demos
            'n_good_samples': self.task_params['n_good_samples'],  # Number of good samples per each trajectory
            'n_bad_samples': self.task_params['n_bad_samples'],  # Number of bad samples per each trajectory
            'n_good_buffer': self.task_params['n_good_buffer'],  # Number of good samples in the buffer
            'n_bad_buffer': self.task_params['n_bad_buffer'],  # Number of bad samples in the buffer
            'good_traj_selection_type': self.task_params['good_traj_selection_type'],  # 'always', 'only_traj'
            'bad_traj_selection_type': self.task_params['bad_traj_selection_type'],  # 'always', 'only_traj'
            'duality_dynamics_type': 'duality',  # Samples to use to update the dynamics 'duality', 'iteration'
            # Initial dual variables
            'init_eta': 0.1,#4.62,
            'init_nu': 0.1,
            'init_omega': 0.1,
            # KL step (epsilon)
            'step_rule': 'laplace',  # Whether to use 'laplace' or 'mc' cost in step adjustment
            'kl_step': self.task_params['kl_step'],  # Kullback-Leibler step (base_step)
            'min_step_mult': 0.01,  # Min possible value of step multiplier (multiplies kl_step)
            'max_step_mult': 10.0,  # Max possible value of step multiplier (multiplies kl_step)
            # KL bad (xi)
            'kl_bad': self.task_params['kl_bad'], #4.2  # Xi KL base value | kl_div_b >= kl_bad
            'min_bad_mult': 0.01,  # Min possible value of step multiplier (multiplies base_kl_bad)
            'max_bad_mult': 10.0,  # Max possible value of step multiplier (multiplies base_kl_bad)
            # KL good (chi)
            'kl_good': self.task_params['kl_good'],  #2.0,  # Chi KL base value  | kl_div_g <= kl_good
            'min_good_mult': 0.01,  # Min possible value of step multiplier (multiplies base_kl_good)
            'max_good_mult': 10.0,  # Max possible value of step multiplier (multiplies base_kl_good)
            # LinearPolicy 'projection'
            'init_pol_wt': 0.01,  # TODO: remove need for init_pol_wt in MDGPS (It should not work with MDGPS)
            'policy_sample_mode': 'add',  # Mode to update dynamics prior (Not used in ConstantPolicyPrior)
            'policy_prior': {'type': ConstantPolicyPrior,
                             'strength': 1e-4,
                             },
            'min_bad_var': np.array([3.0, 3.0, 3.0])*1.0e-02,
            'min_good_var': np.array([3.0, 3.0, 3.0])*1.0e-02,
            # TEMP Hyperparams
            'min_bad_rel_diff': self.task_params['min_bad_rel_diff'],
            'max_bad_rel_diff': self.task_params['max_bad_rel_diff'],
            'mult_bad_rel_diff': self.task_params['mult_bad_rel_diff'],
            'good_fix_rel_multi': self.task_params['good_fix_rel_multi'],
            }

        gps_hyperparams = {
            'T': self.task_params['T'],  # Total points
            'dt': self.task_params['Ts'],
            'iterations': self.task_params['iterations'],  # GPS episodes --> K iterations
            # Samples
            'num_samples': self.task_params['num_samples'],  # Samples for exploration trajs --> N samples
            'noisy_samples': True,
            'seed': self.task_params['seed'],
            'smooth_noise': True,  # Apply Gaussian filter to noise generated
            'smooth_noise_var': 5.0e+0,  # np.power(2*Ts, 2), # Variance to apply to Gaussian Filter. In Kumar (2016) paper, it is the std dev of 2 Ts
            'smooth_noise_renormalize': True,  # Renormalize smooth noise to have variance=1
            'noise_var_scale': 1.e-1*np.ones(self.action_dim),  # Scale to Gaussian noise: N(0, 1)*sqrt(noise_var_scale), only if smooth_noise_renormalize
            # Cost
            'cost': self.cost,
            # Conditions
            'conditions': len(self.init_cond),  # Total number of initial conditions
            'train_conditions': self.task_params['train_cond'],  # Indexes of conditions used for training
            # TrajDist
            'init_traj_distr': init_traj_distr,
            'fit_dynamics': True,
            'dynamics': learned_dynamics,
            'initial_state_var': 1e-2,  # Max value for x0sigma in trajectories
            # TrajOpt
            'traj_opt': traj_opt_method,
            'max_ent_traj': 0.0,  # Weight of maximum entropy term in trajectory optimization #TODO: CHECK THIS VALUE
            # Others
            'algo_hyperparams': dualtrajopt_hyperparams,
            'data_files_dir': self.hyperparams['log_dir'],
        }

        return DualTrajOpt(self.agent, self.env, **gps_hyperparams)
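Components such as the dynamics, the trajectory optimizer, and the policy prior are all configured above as {'type': SomeClass, ...} dictionaries. A hedged sketch of how such a dict is typically consumed in GPS-style codebases; whether this library passes the remaining hyperparameters positionally as a dict or as keyword arguments is an assumption:

def build_component(config):
    """Instantiate a {'type': SomeClass, ...} config dict."""
    params = dict(config)      # copy so the caller's dict is left untouched
    cls = params.pop('type')   # class object stored under 'type'
    return cls(params)         # assumed convention: the class takes the hyperparams dict

# Hypothetical usage: dynamics = build_component(learned_dynamics)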
Example #12
    def create_cost(self):
        change_print_color.change('GRAY')
        print("\nCreating Costs...")

        # Action Cost
        weight = 1e0  # 1e-4
        target = None
        act_cost = {
            'type': CostAction,
            'wu': np.ones(self.action_dim) * weight,
            'target': target,   # Target action value
        }

        # # FK Cost
        # fk_l1_cost = {
        #     'type': CostFK,
        #     'ramp_option': RAMP_CONSTANT,  # How target cost ramps over time. RAMP_* :CONSTANT, LINEAR, QUADRATIC, FINAL_ONLY
        #     'target_pose': target_distance_hand,
        #     'tgt_data_type': 'state',  # 'state' or 'observation'
        #     'tgt_idx': bigman_env.get_state_info(name='distance_hand')['idx'],
        #     'op_point_name': hand_name,
        #     'op_point_offset': hand_offset,
        #     'joints_idx': bigman_env.get_state_info(name='link_position')['idx'],
        #     'joint_ids': bigman_params['joint_ids'][body_part_active],
        #     'robot_model': robot_model,
        #     'wp': np.array([3.0, 3.0]),  # one dim less because 'quat' error | 1)orient 2)pos
        #     'evalnorm': evall1l2term,
        #     #'evalnorm': evallogl2term,
        #     'l1': 1.0,  # 1.0,  # 1.0,  # Weight for l1 norm: log(d^2 + alpha) --> Lorentzian rho-function Precise placement at the target
        #     'l2': 0.0,  # 1.0,  #1.0e-3,  # Weight for l2 norm: d^2 --> Encourages to quickly get the object in the vicinity of the target
        #     'alpha': 1.0e-5,  # e-5,  # Constant added in square root in l1 norm
        #     'wp_final_multiplier': 1,  # 10
        # }
        #
        # fk_l2_cost = {
        #     'type': CostFK,
        #     'ramp_option': RAMP_CONSTANT,  # How target cost ramps over time. RAMP_* :CONSTANT, LINEAR, QUADRATIC, FINAL_ONLY
        #     'target_pose': target_distance_hand,
        #     'tgt_data_type': 'state',  # 'state' or 'observation'
        #     'tgt_idx': bigman_env.get_state_info(name='distance_hand')['idx'],
        #     'op_point_name': hand_name,
        #     'op_point_offset': hand_offset,
        #     'joints_idx': bigman_env.get_state_info(name='link_position')['idx'],
        #     'joint_ids': bigman_params['joint_ids'][body_part_active],
        #     'robot_model': robot_model,
        #     # 'wp': np.array([1.0, 1.0, 1.0, 0.7, 0.8, 0.6]),  # one dim less because 'quat' error | 1)orient 2)pos
        #     #'wp': np.array([1.0, 1.0, 1.0, 3.0, 3.0, 3.0]),  # one dim less because 'quat' error | 1)orient 2)pos
        #     'wp': np.array([3.0, 3.0, 3.0, 3.0, 3.0, 3.0]),  # one dim less because 'quat' error | 1)orient 2)pos
        #     'evalnorm': evall1l2term,
        #     #'evalnorm': evallogl2term,
        #     'l1': 0.0,  # 1.0,  # 1.0,  # Weight for l1 norm: log(d^2 + alpha) --> Lorentzian rho-function Precise placement at the target
        #     'l2': 1.0,  # 1.0,  #1.0e-3,  # Weight for l2 norm: d^2 --> Encourages to quickly get the object in the vicinity of the target
        #     'alpha': 1.0e-5,  # e-5,  # Constant added in square root in l1 norm
        #     'wp_final_multiplier': 1,  # 10
        # }


        # State costs
        target_distance_object = np.zeros(2)
        # input(self.env.get_state_info())
        # input(self.env.get_state_info(name='tgt0')['idx'])
        state_cost_distance = {
            'type': CostState,
            'ramp_option': RAMP_CONSTANT,  # How target cost ramps over time. RAMP_* :CONSTANT, LINEAR, QUADRATIC, FINAL_ONLY
            'evalnorm': evall1l2term,  # TODO: ALWAYS USE evall1l2term
            'l1': 1.0,  # Weight for l1 norm
            'l2': 1.0,  # Weight for l2 norm
            'alpha': 1e-5,  # Constant added in square root in l1 norm
            'wp_final_multiplier': 1.0,  # Weight multiplier on final time step.
            'data_types': {
                'tgt0': {
                    'wp': np.array([1.0, 1.0]),  # State weights - must be set.
                    'target_state': target_distance_object,  # Target state - must be set.
                    'average': None,
                    'data_idx': self.env.get_state_info(name='tgt0')['idx']
                },
            },
        }

        state_final_cost_distance = {
            'type': CostState,
            'ramp_option': RAMP_FINAL_ONLY,  # How target cost ramps over time. RAMP_* :CONSTANT, LINEAR, QUADRATIC, FINAL_ONLY
            'evalnorm': evall1l2term,  # TODO: ALWAYS USE evall1l2term
            'l1': 1.0,  # Weight for l1 norm
            'l2': 1.0,  # Weight for l2 norm
            'alpha': 1e-5,  # Constant added in square root in l1 norm
            'wp_final_multiplier': 1.0,  # Weight multiplier on final time step.
            'data_types': {
                'tgt0': {
                    'wp': np.array([1.0, 1.0]),  # State weights - must be set.
                    'target_state': target_distance_object,  # Target state - must be set.
                    'average': None,
                    'data_idx': self.env.get_state_info(name='tgt0')['idx']
                },
            },
        }

        cost_safe_distance = {
            'type': CostSafeDistance,
            'ramp_option': RAMP_CONSTANT,  # How target cost ramps over time.
            'wp_final_multiplier': 1.0,  # Weight multiplier on final time step.
            'data_types': {
                'tgt1': {
                    'wp': np.array([1.0, 1.0]),  # State weights - must be set.
                    'safe_distance': np.array([0.15, 0.15]),
                    'outside_cost': np.array([0.0, 0.0]),
                    'inside_cost': np.array([1.0, 1.0]),
                    'data_idx': self.env.get_state_info(name='tgt1')['idx']
                },
            },
        }

        cost_state_difference = {
            'type': CostStateDifference,
            'ramp_option': RAMP_CONSTANT,  # How target cost ramps over time. RAMP_* :CONSTANT, LINEAR, QUADRATIC, FINAL_ONLY
            'evalnorm': evall1l2term,  # TODO: ALWAYS USE evall1l2term
            'l1': 1.e-0,  # Weight for l1 norm
            'l2': 5.e-2,  # Weight for l2 norm
            'alpha': 1e-10,  # Constant added in square root in l1 norm
            'wp_final_multiplier': 1.0,  # Weight multiplier on final time step.
            'data_types': {
                'ee': {
                    'wp': np.array([1.0, 1.0, 0.6]),  # State weights - must be set.
                    'target_state': 'tgt0',  # Target state - must be set.
                    'average': None,
                    'tgt_idx': self.env.get_state_info(name='tgt0')['idx'],
                    'data_idx': self.env.get_state_info(name='ee')['idx'],
                    'idx_to_use': [0, 1, 2],  # All: X, Y, theta
                },
            },
        }

        cost_final_state_difference = {
            'type': CostStateDifference,
            'ramp_option': RAMP_FINAL_ONLY,  # How target cost ramps over time. RAMP_* :CONSTANT, LINEAR, QUADRATIC, FINAL_ONLY
            'evalnorm': evall1l2term,  # TODO: ALWAYS USE evall1l2term
            'l1': 1.e-0,  # Weight for l1 norm
            'l2': 5.e-2,  # Weight for l2 norm
            'alpha': 1e-10,  # Constant added in square root in l1 norm
            'wp_final_multiplier': 1.0,  # Weight multiplier on final time step.
            'data_types': {
                'ee': {
                    'wp': np.array([1.0, 1.0, 0.6]),  # State weights - must be set.
                    'target_state': 'tgt0',  # Target state - must be set.
                    'average': None,
                    'tgt_idx': self.env.get_state_info(name='tgt0')['idx'],
                    'data_idx': self.env.get_state_info(name='ee')['idx'],
                    'idx_to_use': [0, 1, 2],  # All: X, Y, theta
                },
            },
        }

        safe_radius = 0.15
        cost_safe_state_difference = {
            'type': CostSafeStateDifference,
            'ramp_option': RAMP_CONSTANT,  # How target cost ramps over time.
            'wp_final_multiplier': 1.0,  # Weight multiplier on final time step.
            'data_types': {
                'ee': {
                    'wp': np.array([1.0, 1.0]),  # State weights - must be set.
                    'safe_distance': np.sqrt([safe_radius**2/2, safe_radius**2/2]),
                    'outside_cost': np.array([0.0, 0.0]),
                    'inside_cost': np.array([1.0, 1.0]),
                    'target_state': 'tgt1',  # Target state - must be set.
                    'tgt_idx': self.env.get_state_info(name='tgt1')['idx'][:2],
                    'data_idx': self.env.get_state_info(name='ee')['idx'][:2],
                    'idx_to_use': [0, 1],  # Only X and Y
                },
            },
        }

        cost_final_safe_state_difference = {
            'type': CostSafeStateDifference,
            'ramp_option': RAMP_FINAL_ONLY,  # How target cost ramps over time.
            'wp_final_multiplier': 1.0,  # Weight multiplier on final time step.
            'data_types': {
                'ee': {
                    'wp': np.array([1.0, 1.0]),  # State weights - must be set.
                    'safe_distance': np.sqrt([safe_radius**2/2, safe_radius**2/2]),
                    'outside_cost': np.array([0.0, 0.0]),
                    'inside_cost': np.array([1.0, 1.0]),
                    'target_state': 'tgt1',  # Target state - must be set.
                    'tgt_idx': self.env.get_state_info(name='tgt1')['idx'][:2],
                    'data_idx': self.env.get_state_info(name='ee')['idx'][:2],
                    'idx_to_use': [0, 1],  # Only X and Y
                },
            },
        }


        # Sum costs
        # costs_and_weights = [(act_cost, 1.0e-1),
        costs_and_weights = [(act_cost, 1.0e-5),
                             # # (fk_cost, 1.0e-0),
                             # (fk_l1_cost, 1.5e-1),
                             # (fk_l2_cost, 1.0e-0),
                             # # (fk_final_cost, 1.0e-0),
                             # (fk_l1_final_cost, 1.5e-1),
                             # (fk_l2_final_cost, 1.0e-0),
                             (cost_state_difference, 5.0e-0),
                             (cost_final_state_difference, 1.0e+3),
                             (cost_safe_state_difference, 5.0e+1),
                             (cost_final_safe_state_difference, 5.0e+4),
                             # WORKING:
                             # (cost_safe_distance, 1.0e+1),
                             # (state_cost_distance, 5.0e-0),
                             # (state_final_cost_distance, 1.0e+3),
                             ]

        cost_sum = {
            'type': CostSum,
            'costs': [cw[0] for cw in costs_and_weights],
            'weights': [cw[1] for cw in costs_and_weights],
        }

        return cost_sum
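A plain-numpy stand-in for what the CostSum entry presumably computes, namely a weighted sum of the individual per-timestep cost signals; the real class also combines gradients and Hessians, which is omitted here:

import numpy as np

def weighted_cost_sum(cost_values, weights):
    """cost_values: one per-timestep cost array per sub-cost."""
    return sum(w * np.asarray(c, dtype=float)
               for c, w in zip(cost_values, weights))

# Example with two sub-costs over a 4-step horizon
total = weighted_cost_sum([[0.1, 0.2, 0.3, 0.4], [1.0, 0.5, 0.2, 0.0]],
                          weights=[1.0e-5, 5.0])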
Example #13
    def create_cost(self):
        """Instantiate the cost that evaluates the RL agent performance.

        Returns:
            Cost Function

        """
        change_print_color.change('GRAY')
        print("\nCreating Costs...")

        # Action Cost
        act_cost = {
            'type': CostAction,
            'wu': np.ones(self.action_dim) * 1e-4,
            'target': None,  # Target action value
        }

        # FK Cost
        target_distance_hand = np.zeros(6)

        fk_cost = {
            'type': CostFK,
            'ramp_option':
            RAMP_CONSTANT,  # How target cost ramps over time. RAMP_* :CONSTANT, LINEAR, QUADRATIC, FINAL_ONLY
            'target_pose': target_distance_hand,
            'tgt_data_type': 'state',  # 'state' or 'observation'
            'tgt_idx': self.env.get_state_info(name='distance_hand')['idx'],
            'op_point_name': self.env.env_params['hand_name'],
            'op_point_offset': self.env.env_params['hand_offset'],
            'joints_idx': self.env.get_state_info(name='link_position')['idx'],
            'joint_ids': self.env.env_params['joint_ids'],
            'robot_model': self.env.env_params['robot_model'],
            # 'wp': np.array([1.0, 1.0, 1.0, 0.7, 0.8, 0.6]),  # one dim less because 'quat' error | 1)orient 2)pos
            'wp':
            np.array([1.0, 1.0, 1.0, 6.0, 6.0, 3.0
                      ]),  # one dim less because 'quat' error | 1)orient 2)pos
            'evalnorm': evall1l2term,
            #'evalnorm': evallogl2term,
            'l1':
            1.0,  # 1.0,  # 1.0,  # Weight for l1 norm: log(d^2 + alpha) --> Lorentzian rho-function Precise placement at the target
            'l2':
            1.0,  # 1.0,  #1.0e-3,  # Weight for l2 norm: d^2 --> Encourages to quickly get the object in the vicinity of the target
            'alpha':
            1.0e-5,  # e-5,  # Constant added in square root in l1 norm
            'wp_final_multiplier': 1,  # 10
        }

        fk_l1_cost = {
            'type': CostFK,
            'ramp_option':
            RAMP_CONSTANT,  # How target cost ramps over time. RAMP_* :CONSTANT, LINEAR, QUADRATIC, FINAL_ONLY
            'target_pose': target_distance_hand,
            'tgt_data_type': 'state',  # 'state' or 'observation'
            'tgt_idx': self.env.get_state_info(name='distance_hand')['idx'],
            'op_point_name': self.env.env_params['hand_name'],
            'op_point_offset': self.env.env_params['hand_offset'],
            'joints_idx': self.env.get_state_info(name='link_position')['idx'],
            'joint_ids': self.env.env_params['joint_ids'],
            'robot_model': self.env.env_params['robot_model'],
            # 'wp': np.array([1.0, 1.0, 1.0, 0.7, 0.8, 0.6]),  # one dim less because 'quat' error | 1)orient 2)pos
            'wp':
            np.array([1.0, 1.0, 1.0, 6.0, 6.0, 3.0
                      ]),  # one dim less because 'quat' error | 1)orient 2)pos
            'evalnorm': evall1l2term,
            #'evalnorm': evallogl2term,
            'l1':
            1.0,  # 1.0,  # 1.0,  # Weight for l1 norm: log(d^2 + alpha) --> Lorentzian rho-function Precise placement at the target
            'l2':
            0.0,  # 1.0,  #1.0e-3,  # Weight for l2 norm: d^2 --> Encourages to quickly get the object in the vicinity of the target
            'alpha':
            1.0e-5,  # e-5,  # Constant added in square root in l1 norm
            'wp_final_multiplier': 1,  # 10
        }

        fk_l2_cost = {
            'type': CostFK,
            'ramp_option':
            RAMP_CONSTANT,  # How target cost ramps over time. RAMP_* :CONSTANT, LINEAR, QUADRATIC, FINAL_ONLY
            'target_pose': target_distance_hand,
            'tgt_data_type': 'state',  # 'state' or 'observation'
            'tgt_idx': self.env.get_state_info(name='distance_hand')['idx'],
            'op_point_name': self.env.env_params['hand_name'],
            'op_point_offset': self.env.env_params['hand_offset'],
            'joints_idx': self.env.get_state_info(name='link_position')['idx'],
            'joint_ids': self.env.env_params['joint_ids'],
            'robot_model': self.env.env_params['robot_model'],
            # 'wp': np.array([1.0, 1.0, 1.0, 0.7, 0.8, 0.6]),  # one dim less because 'quat' error | 1)orient 2)pos
            'wp':
            np.array([1.0, 1.0, 1.0, 6.0, 6.0, 3.0
                      ]),  # one dim less because 'quat' error | 1)orient 2)pos
            'evalnorm': evall1l2term,
            #'evalnorm': evallogl2term,
            'l1':
            0.0,  # 1.0,  # 1.0,  # Weight for l1 norm: log(d^2 + alpha) --> Lorentzian rho-function Precise placement at the target
            'l2':
            1.0,  # 1.0,  #1.0e-3,  # Weight for l2 norm: d^2 --> Encourages to quickly get the object in the vicinity of the target
            'alpha':
            1.0e-5,  # e-5,  # Constant added in square root in l1 norm
            'wp_final_multiplier': 1,  # 10
        }

        fk_final_cost = {
            'type': CostFK,
            'ramp_option':
            RAMP_FINAL_ONLY,  # How target cost ramps over time. RAMP_* :CONSTANT, LINEAR, QUADRATIC, FINAL_ONLY
            'target_pose': target_distance_hand,
            'tgt_data_type': 'state',  # 'state' or 'observation'
            'tgt_idx': self.env.get_state_info(name='distance_hand')['idx'],
            'op_point_name': self.env.env_params['hand_name'],
            'op_point_offset': self.env.env_params['hand_offset'],
            'joints_idx': self.env.get_state_info(name='link_position')['idx'],
            'joint_ids': self.env.env_params['joint_ids'],
            'robot_model': self.env.env_params['robot_model'],
            'wp':
            np.array([1.0, 1.0, 1.0, 10.0, 10.0, 3.0
                      ]),  # one dim less because 'quat' error | 1)orient 2)pos
            'evalnorm': evall1l2term,
            #'evalnorm': evallogl2term,
            'l1':
            1.0,  # Weight for l1 norm: log(d^2 + alpha) --> Lorentzian rho-function Precise placement at the target
            'l2':
            1.0,  # Weight for l2 norm: d^2 --> Encourages to quickly get the object in the vicinity of the target
            'alpha':
            1.0e-5,  # e-5,  # Constant added in square root in l1 norm
            'wp_final_multiplier': 50,
        }

        fk_l1_final_cost = {
            'type': CostFK,
            'ramp_option':
            RAMP_FINAL_ONLY,  # How target cost ramps over time. RAMP_* :CONSTANT, LINEAR, QUADRATIC, FINAL_ONLY
            'target_pose': target_distance_hand,
            'tgt_data_type': 'state',  # 'state' or 'observation'
            'tgt_idx': self.env.get_state_info(name='distance_hand')['idx'],
            'op_point_name': self.env.env_params['hand_name'],
            'op_point_offset': self.env.env_params['hand_offset'],
            'joints_idx': self.env.get_state_info(name='link_position')['idx'],
            'joint_ids': self.env.env_params['joint_ids'],
            'robot_model': self.env.env_params['robot_model'],
            'wp':
            np.array([1.0, 1.0, 1.0, 10.0, 10.0, 3.0
                      ]),  # one dim less because 'quat' error | 1)orient 2)pos
            'evalnorm': evall1l2term,
            #'evalnorm': evallogl2term,
            'l1':
            1.0,  # Weight for l1 norm: log(d^2 + alpha) --> Lorentzian rho-function Precise placement at the target
            'l2':
            0.0,  # Weight for l2 norm: d^2 --> Encourages to quickly get the object in the vicinity of the target
            'alpha':
            1.0e-5,  # e-5,  # Constant added in square root in l1 norm
            'wp_final_multiplier': 50,
        }

        fk_l2_final_cost = {
            'type': CostFK,
            'ramp_option':
            RAMP_FINAL_ONLY,  # How target cost ramps over time. RAMP_* :CONSTANT, LINEAR, QUADRATIC, FINAL_ONLY
            'target_pose': target_distance_hand,
            'tgt_data_type': 'state',  # 'state' or 'observation'
            'tgt_idx': self.env.get_state_info(name='distance_hand')['idx'],
            'op_point_name': self.env.env_params['hand_name'],
            'op_point_offset': self.env.env_params['hand_offset'],
            'joints_idx': self.env.get_state_info(name='link_position')['idx'],
            'joint_ids': self.env.env_params['joint_ids'],
            'robot_model': self.env.env_params['robot_model'],
            'wp':
            np.array([1.0, 1.0, 1.0, 10.0, 10.0, 3.0
                      ]),  # one dim less because 'quat' error | 1)orient 2)pos
            'evalnorm': evall1l2term,
            #'evalnorm': evallogl2term,
            'l1':
            0.0,  # Weight for l1 norm: log(d^2 + alpha) --> Lorentzian rho-function Precise placement at the target
            'l2':
            1.0,  # Weight for l2 norm: d^2 --> Encourages to quickly get the object in the vicinity of the target
            'alpha':
            1.0e-5,  # e-5,  # Constant added in square root in l1 norm
            'wp_final_multiplier': 50,
        }

        # State Cost
        target_distance_object = np.zeros(6)
        state_cost_distance = {
            'type': CostState,
            'ramp_option':
            RAMP_CONSTANT,  # How target cost ramps over time. RAMP_* :CONSTANT, LINEAR, QUADRATIC, FINAL_ONLY
            'evalnorm': evall1l2term,  # TODO: ALWAYS USE evall1l2term
            'l1': 1.0,  # Weight for l1 norm
            'l2': 0.0,  # Weight for l2 norm
            'alpha': 1e-2,  # Constant added in square root in l1 norm
            'wp_final_multiplier':
            1.0,  # Weight multiplier on final time step.
            'data_types': {
                'distance_object': {
                    # 'wp': np.ones_like(target_state),  # State weights - must be set.
                    'wp':
                    np.array([1.0, 1.0, 1.0, 3.0, 3.0,
                              3.0]),  # State weights - must be set.
                    'target_state':
                    target_distance_object,  # Target state - must be set.
                    'average':
                    None,  # (12, 3),
                    'data_idx':
                    self.env.get_state_info(name='distance_object')['idx']
                },
            },
        }
        state_final_cost_distance = {
            'type': CostState,
            'ramp_option':
            RAMP_FINAL_ONLY,  # How target cost ramps over time. RAMP_* :CONSTANT, LINEAR, QUADRATIC, FINAL_ONLY
            'evalnorm': evall1l2term,  # TODO: ALWAYS USE evall1l2term
            'l1': 1.0,  # Weight for l1 norm
            'l2': 0.0,  # Weight for l2 norm
            'alpha': 1e-2,  # Constant added in square root in l1 norm
            'wp_final_multiplier':
            10.0,  # Weight multiplier on final time step.
            'data_types': {
                'distance_object': {
                    # 'wp': np.ones_like(target_state),  # State weights - must be set.
                    'wp':
                    np.array([1.0, 1.0, 1.0, 3.0, 3.0,
                              3.0]),  # State weights - must be set.
                    'target_state':
                    target_distance_object,  # Target state - must be set.
                    'average':
                    None,  # (12, 3),
                    'data_idx':
                    self.env.get_state_info(name='distance_object')['idx']
                },
            },
        }

        # Sum costs
        des_weights = self.task_params['cost_weights']
        print('Cost weights:', des_weights)
        costs_and_weights = [
            (act_cost, des_weights[0]),
            (fk_l1_cost, des_weights[1]),
            (fk_l2_cost, des_weights[2]),
            (fk_l1_final_cost, des_weights[3]),
            (fk_l2_final_cost, des_weights[4]),
            (state_cost_distance, des_weights[5]),
            (state_final_cost_distance, des_weights[6]),
        ]

        cost_sum = {
            'type': CostSum,
            'costs': [cw[0] for cw in costs_and_weights],
            'weights': [cw[1] for cw in costs_and_weights],
        }

        return cost_sum
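The l1/l2/alpha comments above describe a smoothed L1 term for precise placement at the target combined with a quadratic L2 term that encourages a fast approach. A sketch of one common form of that penalty; the exact expression and scaling inside evall1l2term are an assumption, not read from this file:

import numpy as np

def l1_l2_penalty(d, l1=1.0, l2=0.0, alpha=1e-2):
    """d: (T, dim) array of weighted state errors per time step."""
    sq = np.sum(d ** 2, axis=1)
    return 0.5 * l2 * sq + l1 * np.sqrt(alpha + sq)

# Example: an error that shrinks toward zero over a short horizon
errors = np.array([[0.5, 0.2], [0.2, 0.1], [0.05, 0.0]])
per_step_cost = l1_l2_penalty(errors)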
Example #14
    def create_learning_algo(self):
        change_print_color.change('YELLOW')
        print("\nConfiguring learning algorithm...\n")

        # Learning params
        resume_training_itr = None  # Resume from previous training iteration

        # Dynamics
        learned_dynamics = None

        init_traj_distr = {
            'type': init_pd,
            'init_var': np.array([1.0, 1.0, 1.0]) * 5.0e-01,
            'pos_gains': 0.001,  # float or array
            'vel_gains_mult': 0.01,  # Velocity gains multiplier on pos_gains
            'init_action_offset': None,
            'dJoints': self.env.get_total_joints(),  # Total joints in state
            'state_to_pd': 'joints',  # Joints
            'dDistance': 6,
        }

        # Trajectory Optimization Method
        traj_opt_method = {
            'type': TrajOptPI2,
            'kl_threshold':
            1.0,  # KL-divergence threshold between old and new policies.
            'covariance_damping':
            2.0,  # If greater than zero, covariance is computed as a multiple of the old covariance.
            # Multiplier is taken to the power (1 / covariance_damping). If greater than one, slows
            # down convergence and keeps exploration noise high for more iterations.
            'min_temperature':
            0.001,  # Minimum bound of the temperature optimization for the soft-max probabilities of the
            # policy samples.
            'use_sumexp': False,
            'pi2_use_dgd_eta': False,
            'pi2_cons_per_step': True,
            'min_eta': 1e-8,
            'max_eta': 1e16,
            'del0': 1e-4,
        }

        good_trajs = None
        bad_trajs = None
        pi2trajopt_hyperparams = {
            'inner_iterations': self.task_params[
                'inner_iterations'],  # Times the trajectories are updated
            # Initial dual variables
            'init_eta': 4.62,
            # KL step (epsilon)
            'step_rule':
            'laplace',  # Whether to use 'laplace' or 'mc' cost in step adjustment
            'kl_step': 0.2,  # Kullback-Leibler step (base_step)
            'min_step_mult':
            0.01,  # Min possible value of step multiplier (multiplies kl_step)
            'max_step_mult':
            10.0,  # Max possible value of step multiplier (multiplies kl_step)
        }

        gps_hyperparams = {
            'T': self.task_params['T'],  # Total points
            'dt': self.task_params['Ts'],
            'iterations':
            self.task_params['iterations'],  # GPS episodes --> K iterations
            # Samples
            'num_samples': self.task_params[
                'num_samples'],  # Samples for exploration trajs --> N samples
            'noisy_samples': True,
            'smooth_noise': True,  # Apply Gaussian filter to noise generated
            'smooth_noise_var':
            5.0e+0,  # np.power(2*Ts, 2), # Variance to apply to Gaussian Filter. In Kumar (2016) paper, it is the std dev of 2 Ts
            'smooth_noise_renormalize':
            True,  # Renormalize smooth noise to have variance=1
            'noise_var_scale': 5.e-0 * np.ones(
                self.action_dim
            ),  # Scale to Gaussian noise: N(0, 1)*sqrt(noise_var_scale), only if smooth_noise_renormalize
            # Cost
            'cost': self.cost,
            # Conditions
            'conditions':
            len(self.init_cond),  # Total number of initial conditions
            'train_conditions': self.task_params[
                'train_cond'],  # Indexes of conditions used for training
            # TrajDist
            'init_traj_distr': init_traj_distr,
            'fit_dynamics': False,
            'dynamics': learned_dynamics,
            'initial_state_var': 1e-6,  # Max value for x0sigma in trajectories
            # TrajOpt
            'traj_opt': traj_opt_method,
            'max_ent_traj':
            0.0,  # Weight of maximum entropy term in trajectory optimization #TODO: CHECK THIS VALUE
            # Others
            'algo_hyperparams': pi2trajopt_hyperparams,
            'data_files_dir': self.hyperparams['log_dir'],
        }

        return PI2TrajOpt(self.agent, self.env, **gps_hyperparams)
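The 'min_temperature' setting above bounds the temperature of the PI2 soft-max over sample costs. A minimal sketch of that textbook weighting rule, where lower-cost rollouts receive exponentially larger weight; this is the standard PI2 update, not the library's exact implementation:

import numpy as np

def pi2_weights(costs, temperature=1.0):
    """Soft-max probabilities over rollout costs: lower cost -> higher weight."""
    costs = np.asarray(costs, dtype=float)
    z = -(costs - costs.min()) / max(temperature, 1e-8)  # shift for numerical stability
    w = np.exp(z)
    return w / w.sum()

# Example: four rollouts, the cheapest one dominates at low temperature
print(pi2_weights([10.0, 12.0, 9.5, 30.0], temperature=1.0))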