Code Example #1
def load_numpy_model(model_path, numpy_var_list=None):
    '''
        @brief: load numpy variables from an .npy file. The variables may
            come from the baseline or from the ob_normalizer.
        @output:
            Note that this function only returns the values; it does not
            assign them anywhere (whereas the tf variables are loaded into
            the session at the same time).
    '''
    # avoid a mutable default argument; the dict is filled in and returned
    if numpy_var_list is None:
        numpy_var_list = {}
    is_file_valid(model_path)
    logger.info('LOADING numpy variables')

    # allow_pickle is needed on newer numpy to load a pickled dict
    output_save_list = np.load(
        model_path, encoding='latin1', allow_pickle=True).item()
    numpy_name_list = [key for key, val in numpy_var_list.items()]

    # get the weights one by one
    for name, val in output_save_list.items():
        if name in numpy_name_list:
            logger.info(
                '\t\tloading numpy pretrained parameters {}'.format(name))
            numpy_name_list.remove(name)  # just for sanity check
            numpy_var_list[name] = val
        else:
            logger.warning(
                '\t\t**** Parameters Not Exist **** {}'.format(name))

    if len(numpy_name_list) > 0:
        logger.warning(
            'Some parameters were not loaded from the checkpoint: {}'.format(
                numpy_name_list))
    return numpy_var_list
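A minimal usage sketch for load_numpy_model (the file name and variable names below are made up for illustration; np, logger and is_file_valid are assumed to come from the surrounding module):

# hypothetical: the keys must match the names stored in the checkpoint dict
ob_normalizer_vars = {'ob_normalizer/mean': None, 'ob_normalizer/std': None}
loaded_vars = load_numpy_model('checkpoints/ob_normalizer.npy',
                               ob_normalizer_vars)
# loaded_vars now maps each matched name to the numpy value from the file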
Code Example #2
    def __init__(self, args, worker_type, network_type):
        '''
            @brief:
                the master agent has several actors (or samplers) to do the
                sampling for it.
        '''
        super(sampler, self).__init__(args, worker_type, network_type)
        self._base_path = init_path.get_abs_base_dir()
        if self.args.num_ilqr_traj % self.args.num_workers != 0:
            logger.warning(
                'Consider using a number of workers such that the number of '
                'planning trajectories is an integer multiple of the number '
                'of workers. Current: {} planning_traj, {} workers'.format(
                    self.args.num_ilqr_traj, self.args.num_workers))

        self._damping_args = {
            'factor': self.args.LM_damping_factor,
            'min_damping': self.args.min_LM_damping,
            'max_damping': self.args.max_LM_damping
        }
        self._ilqr_data_wrapper = ilqr_data_wrapper.ilqr_data_wrapper(
            self.args, self._env_info['ob_size'],
            self._env_info['action_size'])
        # @self._plan_data is shared with the @ilqr_data_wrapper._plan_data
        self._plan_data = self._ilqr_data_wrapper.get_plan_data()
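The warning above asks for num_ilqr_traj to be an integer multiple of num_workers; a small standalone sketch of how a caller could round the trajectory count up before building the arguments (the helper itself is hypothetical, not part of the repository):

def round_up_to_multiple(num_traj, num_workers):
    # smallest multiple of num_workers that is >= num_traj
    remainder = num_traj % num_workers
    if remainder == 0:
        return num_traj
    return num_traj + (num_workers - remainder)

# e.g. 10 planning trajectories across 4 workers -> plan 12 instead
assert round_up_to_multiple(10, 4) == 12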
Code Example #3
def load_tf_model(sess, model_path, tf_var_list=[], ignore_prefix='INVALID'):
    '''
        @brief: load the tensorflow variables from a numpy .npy file
    '''
    is_file_valid(model_path)
    logger.info('\tLOADING tensorflow variables')

    # load the parameters
    # allow_pickle is needed on newer numpy to load a pickled dict
    output_save_list = np.load(
        model_path, encoding='latin1', allow_pickle=True).item()
    tf_name_list = [var.name for var in tf_var_list]

    # get the weights one by one
    for name, val in output_save_list.items():
        if name in tf_name_list:
            logger.info('\t\tloading TF pretrained parameters {}'.format(name))
            tf_name_list.remove(name)  # just for sanity check

            # pick up the variable that has the name
            var = [var for var in tf_var_list if var.name == name][0]

            assign_op = var.assign(val)
            sess.run(assign_op)  # or `assign_op.op.run()`
        else:
            logger.warning(
                '\t\t**** Parameters Not Exist **** {}'.format(name))

    if len(tf_name_list) > 0:
        logger.warning(
            'Some parameters were not loaded from the checkpoint: {}'.format(
                tf_name_list))
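A minimal usage sketch for load_tf_model, assuming TensorFlow 1.x-style graph mode (via tf.compat.v1) and a checkpoint dict keyed by variable names such as 'policy/w0:0'; the variable and file names are illustrative only:

import tensorflow.compat.v1 as tf
tf.disable_eager_execution()

# a toy variable whose .name ('policy/w0:0') should match a key in the .npy dict
w0 = tf.get_variable('policy/w0', shape=[4, 4])

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    load_tf_model(sess, 'checkpoints/policy.npy', tf_var_list=[w0])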
Code Example #4
def model_load_from_list(sess,
                         model_path,
                         tf_var_list=None,
                         numpy_var_list=None,
                         target_scope_switch='trpo_agent_policy'):
    '''
        @brief:
            if the var lists are given, only those variables are loaded
        @input:
            @target_scope_switch: scope name that replaces the stored
                'trpo_agent_policy' prefix in the checkpoint keys
    '''
    # avoid mutable default arguments; numpy_var_list is filled in and returned
    if tf_var_list is None:
        tf_var_list = []
    if numpy_var_list is None:
        numpy_var_list = {}
    if not model_path.endswith('.npy'):
        model_path = model_path + '.npy'
        logger.warning('[checkpoint] adding the ".npy" to the path name')
    logger.info('[checkpoint] loading checkpoint from {}'.format(model_path))

    # allow_pickle is needed on newer numpy to load a pickled dict
    output_save_list = np.load(
        model_path, encoding='latin1', allow_pickle=True).item()
    tf_name_list = [var.name for var in tf_var_list]
    numpy_name_list = [key for key, val in numpy_var_list.items()]

    # get the weights one by one
    for name, val in output_save_list.items():
        name = name.replace('trpo_agent_policy', target_scope_switch)
        if name not in tf_name_list and name not in numpy_name_list:
            logger.info('**** Parameters Not Exist **** {}'.format(name))
            continue
        elif name in tf_name_list:
            logger.info('loading TF pretrained parameters {}'.format(name))
            tf_name_list.remove(name)  # just for sanity check

            # pick up the variable that has the name
            var = [var for var in tf_var_list if var.name == name][0]
            assign_op = var.assign(val)
            sess.run(assign_op)  # or `assign_op.op.run()`
        else:
            logger.info('loading numpy pretrained parameters {}'.format(name))
            numpy_name_list.remove(name)  # just for sanity check

            # pick up the variable that has the name
            numpy_var_list[name] = val

    if len(tf_name_list) > 0 or len(numpy_name_list) > 0:
        logger.warning(
            'Some parameters were not loaded from the checkpoint:'
            ' {}\n {}'.format(tf_name_list, numpy_name_list))
    return numpy_var_list
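For context, a hedged sketch of the save side these loaders appear to assume: a plain dict mapping variable names to numpy arrays, pickled into a single .npy file. The helper name is made up; the repository's actual save routine may differ:

import numpy as np

def save_model_to_npy(sess, model_path, tf_var_list, numpy_var_list):
    # store {name: value} so the loaders above can match variables by name
    save_dict = {var.name: sess.run(var) for var in tf_var_list}
    save_dict.update(numpy_var_list)
    np.save(model_path, save_dict)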
Code Example #5
def load_expert_data(traj_data_name, traj_episode_num):
    # expert trajectories are loaded at the start of training
    traj_base_dir = init_path.get_abs_base_dir()

    if not traj_data_name.endswith('.npy'):
        traj_data_name = traj_data_name + '.npy'
    data_dir = os.path.join(traj_base_dir, traj_data_name)

    assert os.path.exists(data_dir), \
        'Invalid path: {}'.format(data_dir)
    # allow_pickle is needed on newer numpy to load a pickled object array
    expert_trajectory = np.load(
        data_dir, encoding="latin1", allow_pickle=True)

    # keep only the first traj_episode_num trajectories
    if len(expert_trajectory) > traj_episode_num:
        logger.warning('Using only %d trajs out of %d trajs' %
                       (traj_episode_num, len(expert_trajectory)))
    expert_trajectory = expert_trajectory[:min(traj_episode_num,
                                               len(expert_trajectory))]
    return expert_trajectory
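A usage sketch for load_expert_data with made-up data (the per-episode field names are guesses, and the relative path is assumed to resolve under init_path.get_abs_base_dir()):

import numpy as np

# hypothetical expert data: one dict per episode, pickled as an object array
expert_episodes = [{'observations': np.zeros((5, 3)),
                    'actions': np.zeros((5, 1))} for _ in range(20)]
np.save('expert_data/reacher_expert.npy',
        np.array(expert_episodes, dtype=object))

top_trajs = load_expert_data('expert_data/reacher_expert', traj_episode_num=10)
assert len(top_trajs) == 10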
Code Example #6
    def train(self, data_dict, replay_buffer, training_info={}):
        # make sure the needed data is ready
        assert 'plan_data' in training_info
        self._plan_data = training_info['plan_data']
        self._set_whitening_var(data_dict['whitening_stats'])

        # step 1: get the target action mean and target precision matrix
        training_data, num_data = self._get_training_dataset(data_dict)

        # step 2: train the mean of the action
        if num_data < self.args.policy_sub_batch_size:
            logger.warning("Not enough data!")
            return {}

        batch_per_epoch = num_data // self.args.policy_sub_batch_size
        feed_dict = {
            self._input_ph['target_action_mu']: training_data['target_mu'],
            self._input_ph['target_precision']:
            training_data['target_precision'],
            self._input_ph['start_state']: training_data['start_state']
        }
        for i_iteration in range(self.args.policy_epochs):
            # use an index array so it can be shuffled in place
            # (a range object cannot be shuffled in Python 3)
            data_id = np.arange(num_data)
            self._npr.shuffle(data_id)
            avg_loss = []

            for i_batch in range(batch_per_epoch):
                start = i_batch * self.args.policy_sub_batch_size
                end = start + self.args.policy_sub_batch_size
                batch_idx = data_id[start:end]
                sub_feed_dict = {
                    key: feed_dict[key][batch_idx]
                    for key in feed_dict
                }

                fetch_dict = {
                    'update_op': self._update_operator['update_op'],
                    'loss': self._update_operator['loss']
                }
                training_stat = self._session.run(fetch_dict, sub_feed_dict)
                avg_loss.append(training_stat['loss'])
            logger.info('GPS policy loss {}'.format(np.mean(avg_loss)))

        # the covariance of the controller
        self._policy_cov_data['inv_cov'] = \
            np.mean(training_data['target_precision'], 0) + \
            self.args.gps_policy_cov_damping * \
            np.ones([self._action_size, self._action_size])
        # self._policy_cov_data['precision'] = \
        # np.diag(self._policy_cov_data['inv_cov'])
        # self._policy_cov_data['cov'] = \
        # np.diag(1.0 / self._policy_cov_data['precision'])
        self._policy_cov_data['var'] = \
            1 / np.diag(self._policy_cov_data['inv_cov'])  # vec
        self._policy_cov_data['sig'] = \
            np.diag(self._policy_cov_data['var'])  # matrix
        self._policy_cov_data['chol_pol_covar'] = \
            np.diag(np.sqrt(self._policy_cov_data['var']))  # matrix
        self._policy_cov_data['flat_cov_L'][:] = \
            np.diag(self._policy_cov_data['chol_pol_covar'])  # vec

        return training_stat
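A standalone numpy sketch of the diagonal covariance bookkeeping at the end of train(), with made-up numbers, showing how inv_cov, var, sig, chol_pol_covar and flat_cov_L relate:

import numpy as np

inv_cov = np.diag([4.0, 9.0, 16.0])      # stands in for the averaged precision
var = 1.0 / np.diag(inv_cov)             # per-dimension variance (vector)
sig = np.diag(var)                       # diagonal covariance matrix
chol_pol_covar = np.diag(np.sqrt(var))   # its Cholesky factor (also diagonal)
flat_cov_L = np.diag(chol_pol_covar)     # back to a vector of std deviations

assert np.allclose(np.dot(chol_pol_covar, chol_pol_covar.T), sig)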
Code Example #7
    def _update_parameters(self, rollout_data, training_info={}):
        # STEP 0: TODO: THE EXPERT DATA
        reward = [
            np.sum(i_rollout_data['rewards'])
            for i_rollout_data in rollout_data
        ]
        logger.warning("mean_reward {}".format(np.mean(reward)))
        logger.warning("mean_reward {}".format(np.mean(reward)))
        logger.warning("mean_reward {}".format(np.mean(reward)))
        logger.warning("mean_reward {}".format(np.mean(reward)))
        logger.warning("mean_reward {}".format(np.mean(reward)))
        logger.warning("mean_reward {}".format(np.mean(reward)))
        logger.warning("mean_reward {}".format(np.mean(reward)))
        rollout_data = list(np.load('/home/tingwu/mb_baseline/data/test.npy')
                            )[:self.args.num_ilqr_traj]
        # step 1: preprocess the data and set the reward function
        for key in self._network:
            assert len(self._network[key]) == 1
        assert len(rollout_data) == self.args.num_ilqr_traj
        assert len(rollout_data[0]['actions']) == self.args.ilqr_depth
        self._update_whitening_stats(rollout_data)
        training_data = self._preprocess_data(rollout_data)
        training_stats = {'avg_reward': training_data['avg_reward']}
        self._init_traj_data(training_data)
        self._set_cost(training_data)  # the estimation of the reward function

        # TODO: TODO: DEBUG!!
        # step 2: train the dynamics and grab the derivative data
        dynamics_data = self._network['dynamics'][0].train(
            training_data, self._replay_buffer)
        self._set_local_dynamics(dynamics_data)

        # step 3: fit a local linearization of the policy from rollout data
        self._network['policy'][0].fit_local_linear_gaussian(training_data)

        # TODO
        self._summary_estimation(policy='nn',
                                 training_data=training_data,
                                 run_forward_pass=True)

        # TODO
        # step 4: the variables of MD-GPS optimization
        self._update_optimization_variable()

        # step 5: update the traj (local ilqr controller)
        self._update_traj(training_data)

        # step 6: update the policy network
        policy_training_stats = self._network['policy'][0].train(
            training_data,
            self._replay_buffer,
            training_info={'plan_data': self._plan_data})

        # step 7: gather and record the training stats
        self._replay_buffer.add_data(training_data)
        self._iteration += 1
        training_stats.update(policy_training_stats)

        # TODO:
        self._summary_estimation(policy='ilqr',
                                 training_data=training_data,
                                 end_iteration=True)

        return training_stats
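For reference, a standalone version of the reward summary at the top of this method; the helper name and the toy rollouts are made up:

import numpy as np

def mean_episode_reward(rollout_data):
    # sum the per-step rewards of each rollout, then average over rollouts
    episode_returns = [np.sum(rollout['rewards']) for rollout in rollout_data]
    return np.mean(episode_returns)

rollouts = [{'rewards': np.array([1.0, 0.5])},
            {'rewards': np.array([2.0])}]
assert mean_episode_reward(rollouts) == 1.75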