import os

import numpy as np

# `logger`, `init_path`, `is_file_valid` and `ilqr_data_wrapper` are the
# project's own utilities, assumed to be imported at module level.


def load_numpy_model(model_path, numpy_var_list=None):
    ''' @brief: load numpy variables from a npy file. The variables could be
            from the baseline or from the ob_normalizer
        @output: note that this function only returns the values; it does not
            load them anywhere (whereas the tf variables are assigned in
            place at load time)
    '''
    # avoid the mutable default-argument pitfall
    if numpy_var_list is None:
        numpy_var_list = {}

    is_file_valid(model_path)
    logger.info('LOADING numpy variables')

    output_save_list = np.load(model_path, encoding='latin1').item()
    numpy_name_list = list(numpy_var_list.keys())

    # get the weights one by one
    for name, val in output_save_list.items():
        if name in numpy_name_list:
            logger.info(
                '\t\tloading numpy pretrained parameters {}'.format(name))
            numpy_name_list.remove(name)  # just for sanity check
            numpy_var_list[name] = val
        else:
            logger.warning(
                '\t\t**** Parameter Not Found **** {}'.format(name))

    if len(numpy_name_list) > 0:
        logger.warning(
            'Some parameters were not loaded from the checkpoint: {}'.format(
                numpy_name_list))
    return numpy_var_list
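# Example usage (a minimal sketch, not part of the original module; the
# checkpoint path and variable names below are hypothetical placeholders):
def _example_load_numpy_model():
    # ask for the two ob_normalizer statistics stored in the checkpoint
    numpy_vars = {'ob_normalizer/mean': None, 'ob_normalizer/std': None}
    loaded = load_numpy_model('checkpoint/example.npy', numpy_vars)
    # the values are only returned here; the caller is responsible for
    # pushing them into the normalizer
    return loaded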
def __init__(self, args, worker_type, network_type):
    ''' @brief: the master agent has several actors (or samplers) that do
            the sampling for it.
    '''
    super(sampler, self).__init__(args, worker_type, network_type)
    self._base_path = init_path.get_abs_base_dir()

    if self.args.num_ilqr_traj % self.args.num_workers != 0:
        logger.warning(
            'Choose the number of workers so that the number of planning '
            'trajectories is an integer multiple of it. '
            'Current: {} planning_traj, {} workers'.format(
                self.args.num_ilqr_traj, self.args.num_workers))

    self._damping_args = {
        'factor': self.args.LM_damping_factor,
        'min_damping': self.args.min_LM_damping,
        'max_damping': self.args.max_LM_damping
    }

    self._ilqr_data_wrapper = ilqr_data_wrapper.ilqr_data_wrapper(
        self.args, self._env_info['ob_size'],
        self._env_info['action_size'])
    # @self._plan_data is shared with @ilqr_data_wrapper._plan_data
    self._plan_data = self._ilqr_data_wrapper.get_plan_data()
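# Sketch of how the Levenberg-Marquardt damping arguments above are
# typically consumed by an iLQR backward pass. This is an assumption about
# the standard adaptive scheme (multiply on a rejected step, divide on an
# accepted one, then clamp), not code from this repo:
def _example_update_damping(damping, step_accepted, damping_args):
    if step_accepted:
        damping /= damping_args['factor']
    else:
        damping *= damping_args['factor']
    return min(max(damping, damping_args['min_damping']),
               damping_args['max_damping'])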
def load_tf_model(sess, model_path, tf_var_list=None,
                  ignore_prefix='INVALID'):
    ''' @brief: load the tensorflow variables from a numpy npy file
    '''
    if tf_var_list is None:  # avoid the mutable default argument
        tf_var_list = []
    is_file_valid(model_path)
    logger.info('\tLOADING tensorflow variables')

    # load the parameters
    output_save_list = np.load(model_path, encoding='latin1').item()
    tf_name_list = [var.name for var in tf_var_list]

    # assign the weights one by one
    for name, val in output_save_list.items():
        if name in tf_name_list:
            logger.info(
                '\t\tloading TF pretrained parameters {}'.format(name))
            tf_name_list.remove(name)  # just for sanity check

            # pick up the variable that has the name and assign the value
            var = [var for var in tf_var_list if var.name == name][0]
            sess.run(var.assign(val))
        else:
            logger.warning(
                '\t\t**** Parameter Not Found **** {}'.format(name))

    if len(tf_name_list) > 0:
        logger.warning(
            'Some parameters were not loaded from the checkpoint: {}'.format(
                tf_name_list))
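# Example usage (a sketch; assumes a live tf.Session and that the names
# stored in the checkpoint match the graph's `var.name`, e.g.
# 'policy/w0:0'; the 'policy' scope is a hypothetical placeholder):
def _example_load_tf_model(sess):
    import tensorflow as tf
    policy_vars = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope='policy')
    load_tf_model(sess, 'checkpoint/example.npy', tf_var_list=policy_vars)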
def model_load_from_list(sess, model_path, tf_var_list=None,
                         numpy_var_list=None,
                         target_scope_switch='trpo_agent_policy'):
    ''' @brief: load the given tf and numpy variable lists from a npy
            checkpoint
        @input:
            @target_scope_switch: the scope name that replaces
                'trpo_agent_policy' in the names stored in the checkpoint
    '''
    if tf_var_list is None:
        tf_var_list = []
    if numpy_var_list is None:
        numpy_var_list = {}

    if not model_path.endswith('.npy'):
        model_path = model_path + '.npy'
        logger.warning('[checkpoint] adding the ".npy" to the path name')
    logger.info('[checkpoint] loading checkpoint from {}'.format(model_path))

    output_save_list = np.load(model_path, encoding='latin1').item()
    tf_name_list = [var.name for var in tf_var_list]
    numpy_name_list = list(numpy_var_list.keys())

    # get the weights one by one
    for name, val in output_save_list.items():
        name = name.replace('trpo_agent_policy', target_scope_switch)

        if name not in tf_name_list and name not in numpy_name_list:
            logger.info('**** Parameter Not Found **** {}'.format(name))
            continue
        elif name in tf_name_list:
            logger.info('loading TF pretrained parameters {}'.format(name))
            tf_name_list.remove(name)  # just for sanity check

            # pick up the variable that has the name and assign the value
            var = [var for var in tf_var_list if var.name == name][0]
            sess.run(var.assign(val))
        else:
            logger.info(
                'loading numpy pretrained parameters {}'.format(name))
            numpy_name_list.remove(name)  # just for sanity check
            numpy_var_list[name] = val

    if len(tf_name_list) > 0 or len(numpy_name_list) > 0:
        logger.warning(
            'Some parameters were not loaded from the checkpoint: '
            '{}\n{}'.format(tf_name_list, numpy_name_list))
    return numpy_var_list
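# Example (a sketch): load a checkpoint that was saved under the
# 'trpo_agent_policy' scope into variables living under a different scope.
# 'mbmf_agent_policy' and the checkpoint path are hypothetical names.
def _example_model_load_from_list(sess, policy_vars):
    # every stored name 'trpo_agent_policy/...' is rewritten to
    # 'mbmf_agent_policy/...' before the lookup against policy_vars
    return model_load_from_list(
        sess, 'checkpoint/trpo_expert', tf_var_list=policy_vars,
        target_scope_switch='mbmf_agent_policy')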
def load_expert_data(traj_data_name, traj_episode_num):
    # locate the expert trajectories relative to the repo base directory
    traj_base_dir = init_path.get_abs_base_dir()
    if not traj_data_name.endswith('.npy'):
        traj_data_name = traj_data_name + '.npy'
    data_dir = os.path.join(traj_base_dir, traj_data_name)
    assert os.path.exists(data_dir), \
        logger.error('Invalid path: {}'.format(data_dir))
    expert_trajectory = np.load(data_dir, encoding='latin1')

    # keep only the first @traj_episode_num trajectories
    if len(expert_trajectory) > traj_episode_num:
        logger.warning('Using only %d trajs out of %d trajs' %
                       (traj_episode_num, len(expert_trajectory)))
        expert_trajectory = expert_trajectory[:traj_episode_num]
    return expert_trajectory
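# Example usage (a sketch; 'data/expert_traj' is a hypothetical path
# relative to the base dir returned by init_path.get_abs_base_dir()):
def _example_load_expert_data():
    # at most 10 episodes come back, with a warning if the file holds more
    return load_expert_data('data/expert_traj', traj_episode_num=10)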
def train(self, data_dict, replay_buffer, training_info=None):
    # make sure the needed data is ready
    training_info = training_info or {}
    assert 'plan_data' in training_info
    self._plan_data = training_info['plan_data']
    self._set_whitening_var(data_dict['whitening_stats'])

    # step 1: get the target action mean and target precision matrix
    training_data, num_data = self._get_training_dataset(data_dict)

    # step 2: train the mean of the action
    if num_data < self.args.policy_sub_batch_size:
        logger.warning('Not enough data!')
        return {}
    batch_per_epoch = num_data // self.args.policy_sub_batch_size

    feed_dict = {
        self._input_ph['target_action_mu']: training_data['target_mu'],
        self._input_ph['target_precision']:
            training_data['target_precision'],
        self._input_ph['start_state']: training_data['start_state']
    }

    training_stat = {}  # stays empty if policy_epochs == 0
    for i_iteration in range(self.args.policy_epochs):
        # note: np.arange (not range) so that the in-place shuffle works
        data_id = np.arange(num_data)
        self._npr.shuffle(data_id)
        avg_loss = []

        for i_batch in range(batch_per_epoch):
            batch_idx = data_id[
                i_batch * self.args.policy_sub_batch_size:
                (i_batch + 1) * self.args.policy_sub_batch_size]
            sub_feed_dict = {
                key: feed_dict[key][batch_idx] for key in feed_dict
            }
            fetch_dict = {
                'update_op': self._update_operator['update_op'],
                'loss': self._update_operator['loss']
            }
            training_stat = self._session.run(fetch_dict, sub_feed_dict)
            avg_loss.append(training_stat['loss'])

        logger.info('GPS policy loss {}'.format(np.mean(avg_loss)))

    # the covariance of the controller: average the target precision over
    # the dataset and damp it so that the inverse stays well-conditioned
    self._policy_cov_data['inv_cov'] = \
        np.mean(training_data['target_precision'], 0) + \
        self.args.gps_policy_cov_damping * \
        np.ones([self._action_size, self._action_size])
    self._policy_cov_data['var'] = \
        1.0 / np.diag(self._policy_cov_data['inv_cov'])  # vector
    self._policy_cov_data['sig'] = \
        np.diag(self._policy_cov_data['var'])  # matrix
    self._policy_cov_data['chol_pol_covar'] = \
        np.diag(np.sqrt(self._policy_cov_data['var']))  # matrix
    self._policy_cov_data['flat_cov_L'][:] = \
        np.diag(self._policy_cov_data['chol_pol_covar'])  # vector

    return training_stat
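# A minimal numerical sketch (assumed values) of the covariance bookkeeping
# above: from the damped average precision matrix, only the diagonal is
# kept, so var = 1 / diag(inv_cov), sig = diag(var), and chol_pol_covar is
# the elementwise square root of sig.
def _example_policy_covariance(action_size=2, damping=1e-2):
    precision = np.stack([np.eye(action_size) * 4.0 for _ in range(8)])
    inv_cov = np.mean(precision, 0) + \
        damping * np.ones([action_size, action_size])
    var = 1.0 / np.diag(inv_cov)      # per-dimension action variance
    chol = np.diag(np.sqrt(var))      # Cholesky factor of diag(var)
    assert np.allclose(chol.dot(chol.T), np.diag(var))
    return var, chol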
def _update_parameters(self, rollout_data, training_info=None):
    # log the average reward of the incoming rollouts
    reward = [np.sum(i_rollout_data['rewards'])
              for i_rollout_data in rollout_data]
    logger.warning('mean_reward {}'.format(np.mean(reward)))

    # step 1: preprocess the data and set the reward function
    for key in self._network:
        assert len(self._network[key]) == 1
    assert len(rollout_data) == self.args.num_ilqr_traj
    assert len(rollout_data[0]['actions']) == self.args.ilqr_depth
    self._update_whitening_stats(rollout_data)
    training_data = self._preprocess_data(rollout_data)
    training_stats = {'avg_reward': training_data['avg_reward']}
    self._init_traj_data(training_data)
    self._set_cost(training_data)  # the estimation of the reward function

    # step 2: train the dynamics and grab the derivative data
    dynamics_data = self._network['dynamics'][0].train(
        training_data, self._replay_buffer)
    self._set_local_dynamics(dynamics_data)

    # step 3: fit a local linearization of the policy from rollout data
    self._network['policy'][0].fit_local_linear_gaussian(training_data)
    # TODO
    self._summary_estimation(policy='nn', training_data=training_data,
                             run_forward_pass=True)

    # step 4: the variables of the MD-GPS optimization
    self._update_optimization_variable()

    # step 5: update the traj (local ilqr controller)
    self._update_traj(training_data)

    # step 6: update the policy network
    policy_training_stats = self._network['policy'][0].train(
        training_data, self._replay_buffer,
        training_info={'plan_data': self._plan_data})

    # step 7: gather and record the training stats
    self._replay_buffer.add_data(training_data)
    self._iteration += 1
    training_stats.update(policy_training_stats)
    # TODO
    self._summary_estimation(policy='ilqr', training_data=training_data,
                             end_iteration=True)

    return training_stats
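# Sketch of the outer loop that drives _update_parameters (hypothetical
# names; the sampler API below is an assumption, not this repo's code):
def _example_training_loop(agent, agent_sampler, num_iterations=100):
    for _ in range(num_iterations):
        rollout_data = agent_sampler.rollouts_using_worker_planning()
        stats = agent._update_parameters(rollout_data)
        logger.info('avg_reward: {}'.format(stats['avg_reward']))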