def __init__(self, args, observation_size, action_size, network_type,
             task_queue, result_queue, worker_id,
             name_scope='planning_worker'):
    # the multiprocessing initialization
    multiprocessing.Process.__init__(self)
    self.args = args
    self._name_scope = name_scope
    self._worker_id = worker_id
    self._network_type = network_type
    self._npr = np.random.RandomState(args.seed + self._worker_id)

    self._observation_size = observation_size
    self._action_size = action_size

    self._task_queue = task_queue
    self._result_queue = result_queue
    logger.info('Worker {} online'.format(self._worker_id))
    self._base_dir = init_path.get_base_dir()
def load_tf_model(sess, model_path, tf_var_list=[], ignore_prefix='INVALID'):
    ''' @brief: load the tensorflow variables from a numpy npy file
    '''
    is_file_valid(model_path)
    logger.info('\tLOADING tensorflow variables')

    # load the parameters
    output_save_list = np.load(model_path, encoding='latin1').item()
    tf_name_list = [var.name for var in tf_var_list]

    # get the weights one by one
    for name, val in output_save_list.items():
        if name in tf_name_list:
            logger.info('\t\tloading TF pretrained parameters {}'.format(name))
            tf_name_list.remove(name)  # just for sanity check

            # pick up the variable that has the name
            var = [var for var in tf_var_list if var.name == name][0]
            assign_op = var.assign(val)
            sess.run(assign_op)  # or `assign_op.op.run()`
        else:
            logger.warning(
                '\t\t**** Parameters Not Exist **** {}'.format(name))

    if len(tf_name_list) > 0:
        logger.warning(
            'Some parameters are not loaded from the checkpoint: {}'.format(
                tf_name_list))
def _play(self, planning_data):
    if self.args.num_expert_episode_to_save > 0 and \
            self._previous_reward > self._env_solved_reward and \
            self._worker_id == 0:
        start_save_episode = True
        logger.info('Last episodic reward: %.4f' % self._previous_reward)
        logger.info('Minimum reward of %.4f is needed to start saving'
                    % self._env_solved_reward)
        logger.info('[SAVING] Worker %d will record its episode data'
                    % self._worker_id)
    else:
        start_save_episode = False
        if self.args.num_expert_episode_to_save > 0 \
                and self._worker_id == 0:
            logger.info('Last episodic reward: %.4f' % self._previous_reward)
            logger.info('Minimum reward of %.4f is needed to start saving'
                        % self._env_solved_reward)

    traj_episode = play_episode_with_env(
        self._env, self._act, {
            'use_random_action': planning_data['use_random_action'],
            'record_flag': start_save_episode,
            'num_episode': self.args.num_expert_episode_to_save,
            'data_name': self.args.task + '_' + self.args.exp_id
        }
    )
    self._previous_reward = np.sum(traj_episode['rewards'])

    return traj_episode
def rollouts_using_worker_planning(self, num_timesteps=None,
                                   use_random_action=False):
    ''' @brief:
            Run the experiments until a total of @timesteps_per_batch
            timesteps are collected.
    '''
    self._current_iteration += 1
    num_timesteps_received = 0
    target_timesteps = self.args.timesteps_per_batch \
        if num_timesteps is None else num_timesteps
    timesteps_needed = target_timesteps
    rollout_data = []

    while True:
        # init the data
        self._ilqr_data_wrapper.init_episode_data()
        traj_episode = self._play(use_random_action)
        logger.info('done with episode')
        rollout_data.append(traj_episode)
        num_timesteps_received += len(traj_episode['rewards'])

        # update the remaining timesteps (use the requested budget when
        # @num_timesteps is given, mirroring rollouts_using_worker_playing)
        timesteps_needed = target_timesteps - num_timesteps_received

        if timesteps_needed <= 0 or self.args.test:
            break

    logger.info('{} timesteps from {} episodes collected'.format(
        num_timesteps_received, len(rollout_data)))

    return {'data': rollout_data}
def _act(self, state, control_info={'use_random_action': False}):
    if 'use_random_action' in control_info and \
            control_info['use_random_action']:
        # use random policy
        action = self._npr.uniform(-1, 1, [self._action_size])
        return action, [-1], [-1]
    else:
        # update the data
        self._update_plan_data(state)

        pred_reward = [
            -self._plan_data[i_traj]['l'].sum()
            for i_traj in range(self.args.num_ilqr_traj)
        ]

        for _ in range(self.args.ilqr_iteration):
            self._backward_pass()
            self._forward_pass()

        # logging information
        for i_traj in range(self.args.num_ilqr_traj):
            diff = -self._plan_data[i_traj]['l'].sum() - pred_reward[i_traj]
            logger.info('Traj {}: Pred ({}) + ({})'.format(
                i_traj, pred_reward[i_traj], diff))

        # get the control signals from the best traj
        traj_id = np.argsort(
            [np.sum(traj_data['l']) for traj_data in self._plan_data])[0]
        return self._plan_data[traj_id]['u'][0], [-1], [-1]
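# Side note (illustrative, not from the original code): the trajectory
# selection at the end of _act only needs the index of the minimum total
# cost, so np.argmin over the summed costs is an equivalent and slightly
# cheaper way to write argsort(...)[0]. A toy example with made-up plan data:
def _example_pick_best_traj():
    import numpy as np
    plan_data = [{'l': np.array([0.5, 0.7])}, {'l': np.array([0.2, 0.3])}]
    best_traj = int(np.argmin([np.sum(traj['l']) for traj in plan_data]))
    return best_traj  # -> 1, the trajectory with the smaller total cost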
def is_file_valid(model_path, save_file=False):
    assert model_path.endswith('.npy'), logger.error(
        'Invalid file provided {}'.format(model_path))
    if not save_file:
        assert os.path.exists(model_path), logger.error(
            'file not found: {}'.format(model_path))
    logger.info('[LOAD/SAVE] checkpoint path is {}'.format(model_path))
def load_numpy_model(model_path, numpy_var_list={}):
    ''' @brief:
            load numpy variables from npy files. The variables could be from
            the baseline or from the ob_normalizer
        @output:
            Note that this function only returns the values; it does not load
            them into any network (whereas the tf variables are assigned at
            load time)
    '''
    is_file_valid(model_path)
    logger.info('LOADING numpy variables')

    output_save_list = np.load(model_path, encoding='latin1').item()
    numpy_name_list = [key for key, val in numpy_var_list.items()]

    # get the weights one by one
    for name, val in output_save_list.items():
        if name in numpy_name_list:
            logger.info(
                '\t\tloading numpy pretrained parameters {}'.format(name))
            numpy_name_list.remove(name)  # just for sanity check
            numpy_var_list[name] = val
        else:
            logger.warning(
                '\t\t**** Parameters Not Exist **** {}'.format(name))

    if len(numpy_name_list) > 0:
        logger.warning(
            'Some parameters are not loaded from the checkpoint: {}'.format(
                numpy_name_list))
    return numpy_var_list
def train(trainer, sampler, worker, dynamics, policy, reward, args=None):
    logger.info('Training starts at {}'.format(init_path.get_abs_base_dir()))
    network_type = {'policy': policy, 'dynamics': dynamics, 'reward': reward}

    # make the trainer and sampler
    sampler_agent = make_sampler(sampler, worker, network_type, args)
    trainer_tasks, trainer_results, trainer_agent, init_weights = \
        make_trainer(trainer, network_type, args)
    sampler_agent.set_weights(init_weights)

    timer_dict = OrderedDict()
    timer_dict['Program Start'] = time.time()
    totalsteps = 0
    current_iteration = 0

    while True:
        timer_dict['** Program Total Time **'] = time.time()

        # step 1: collect rollout data
        if current_iteration == 0 and args.random_timesteps > 0 and \
                (not (args.gt_dynamics and args.gt_reward)):
            # we could first generate random rollout data for exploration
            logger.info(
                'Generating {} random timesteps'.format(args.random_timesteps)
            )
            rollout_data = sampler_agent.rollouts_using_worker_planning(
                args.random_timesteps, use_random_action=True
            )
        else:
            rollout_data = sampler_agent.rollouts_using_worker_planning()
        timer_dict['Generate Rollout'] = time.time()

        # step 2: train the weights of the dynamics, reward and policy networks
        training_info = {'network_to_train': ['dynamics', 'reward', 'policy']}
        trainer_tasks.put(
            (parallel_util.TRAIN_SIGNAL,
             {'data': rollout_data['data'], 'training_info': training_info})
        )
        trainer_tasks.join()
        training_return = trainer_results.get()
        timer_dict['Train Weights'] = time.time()

        # step 3: update the weights
        sampler_agent.set_weights(training_return['network_weights'])
        timer_dict['Assign Weights'] = time.time()

        # log and print the results
        log_results(training_return, timer_dict)
        totalsteps = training_return['totalsteps']

        if totalsteps > args.max_timesteps:
            break
        else:
            current_iteration += 1

    # end of training
    sampler_agent.end()
    trainer_tasks.put((parallel_util.END_SIGNAL, None))
def _get_groundtruth_reward(self, rollout_data, training_stats):
    for i_episode in rollout_data:
        i_episode['raw_episodic_reward'] = sum(i_episode['raw_rewards'])
    avg_reward = np.mean(
        [i_episode['raw_episodic_reward'] for i_episode in rollout_data])
    logger.info('Raw reward: {}'.format(avg_reward))
    training_stats['RAW_reward'] = avg_reward
def pred(self, data_dict):
    logger.info('This function should not be used!')
    reward = []
    for i_data in range(len(data_dict['action'])):
        i_reward = self._env.reward(
            {key: data_dict[key][i_data] for key in ['start_state', 'action']}
        )
        reward.append(i_reward)
    return np.stack(reward), -1, -1
def train_mf(mb_steps, policy_weight, trainer, sampler, worker, dynamics,
             policy, reward, args=None):
    logger.info('Training starts at {}'.format(init_path.get_abs_base_dir()))
    network_type = {'policy': policy, 'dynamics': dynamics, 'reward': reward}

    # make the trainer and sampler
    sampler_agent = make_sampler(sampler, worker, network_type, args)
    trainer_tasks, trainer_results, trainer_agent, init_weights = \
        make_trainer(trainer, network_type, args)

    # initialize the policy with the dagger policy weights
    trainer_tasks.put((parallel_util.SET_POLICY_WEIGHT, policy_weight))
    trainer_tasks.join()
    init_weights['policy'][0] = policy_weight
    sampler_agent.set_weights(init_weights)

    timer_dict = OrderedDict()
    timer_dict['Program Start'] = time.time()
    current_iteration = 0

    while True:
        timer_dict['** Program Total Time **'] = time.time()

        # step 1: collect rollout data
        rollout_data = \
            sampler_agent.rollouts_using_worker_playing(use_true_env=True)
        timer_dict['Generate Rollout'] = time.time()

        # step 2: train the weights of the dynamics, reward and policy networks
        training_info = {'network_to_train': ['dynamics', 'reward', 'policy']}
        trainer_tasks.put(
            (parallel_util.TRAIN_SIGNAL,
             {'data': rollout_data['data'], 'training_info': training_info})
        )
        trainer_tasks.join()
        training_return = trainer_results.get()
        timer_dict['Train Weights'] = time.time()

        # step 3: update the weights
        sampler_agent.set_weights(training_return['network_weights'])
        timer_dict['Assign Weights'] = time.time()

        # log and print the results
        log_results(training_return, timer_dict, mb_steps)

        if training_return['totalsteps'] > args.max_timesteps:
            break
        else:
            current_iteration += 1

    # end of training
    sampler_agent.end()
    trainer_tasks.put((parallel_util.END_SIGNAL, None))
def train_initial_policy(self, data_dict, replay_buffer, training_info={}):
    # get the validation set
    # hack the policy val percentage to 0.1 for policy initialization
    self.args.policy_val_percentage = 0.1
    new_data_id = list(range(len(data_dict['start_state'])))
    self._npr.shuffle(new_data_id)
    num_val = int(len(new_data_id) * self.args.policy_val_percentage)
    val_data = {
        key: data_dict[key][new_data_id][:num_val]
        for key in ['start_state', 'end_state', 'action']
    }

    # get the training set
    train_data = {
        key: data_dict[key][new_data_id][num_val:]
        for key in ['start_state', 'end_state', 'action']
    }

    for i_epoch in range(self.args.dagger_epoch):
        # get the number of batches
        num_batches = len(train_data['action']) // \
            self.args.initial_policy_bs
        assert num_batches > 0, logger.error('batch_size > data_set')
        avg_training_loss = []

        for i_batch in range(num_batches):
            # train on each sub batch
            feed_dict = {
                self._input_ph[key]: train_data[key][
                    i_batch * self.args.initial_policy_bs:
                    (i_batch + 1) * self.args.initial_policy_bs
                ]
                for key in ['start_state', 'action']
            }
            fetch_dict = {
                'update_op': self._update_operator['initial_update_op'],
                'train_loss': self._update_operator['initial_policy_loss']
            }
            training_stat = self._session.run(fetch_dict, feed_dict)
            avg_training_loss.append(training_stat['train_loss'])

        val_loss = self.eval(val_data)
        logger.info(
            '[initial policy at epoch {}]: Val Loss: {}, Train Loss: {}'.format(
                i_epoch, val_loss, np.mean(avg_training_loss)
            )
        )

    training_stat['val_loss'] = val_loss
    training_stat['avg_train_loss'] = np.mean(avg_training_loss)
    return training_stat
def rollouts_using_worker_playing(self, num_timesteps=None,
                                  use_random_action=False,
                                  use_true_env=False):
    """ @brief:
            In this case, the sampler will call workers to generate data
    """
    self._current_iteration += 1
    num_timesteps_received = 0
    numsteps_indicator = False if num_timesteps is None else True
    timesteps_needed = self.args.timesteps_per_batch \
        if num_timesteps is None else num_timesteps
    rollout_data = []

    while True:
        # how many episodes are expected to complete the current dataset?
        num_estimated_episode = max(
            int(np.ceil(timesteps_needed / self._avg_episode_len)), 1)

        # send out the task for each worker to play
        for _ in range(num_estimated_episode):
            self._task_queue.put((parallel_util.WORKER_PLAYING, {
                'use_true_env': use_true_env,
                'use_random_action': use_random_action
            }))
        self._task_queue.join()

        # collect the data
        for _ in range(num_estimated_episode):
            traj_episode = self._result_queue.get()
            rollout_data.append(traj_episode)
            num_timesteps_received += len(traj_episode['rewards'])

        # update the average timesteps per episode and the timesteps remaining
        self._avg_episode_len = \
            float(num_timesteps_received) / len(rollout_data)
        if numsteps_indicator:
            timesteps_needed = num_timesteps - num_timesteps_received
        else:
            timesteps_needed = self.args.timesteps_per_batch - \
                num_timesteps_received
        logger.info('Finished {}th episode'.format(len(rollout_data)))

        if timesteps_needed <= 0 or self.args.test:
            break

    logger.info('{} timesteps from {} episodes collected'.format(
        num_timesteps_received, len(rollout_data)))

    return {'data': rollout_data}
def train(self, data_dict, replay_buffer, training_info={}):
    # update the whitening stats of the network
    self._set_whitening_var(data_dict['whitening_stats'])

    # get the validation data
    new_data_id = list(range(len(data_dict['start_states'])))
    self._npr.shuffle(new_data_id)
    # cap the validation set size at dynamics_val_max_size
    num_val = min(int(len(new_data_id) * self.args.dynamics_val_percentage),
                  self.args.dynamics_val_max_size)
    val_data = {
        'start_states': data_dict['start_states'][new_data_id][:num_val],
        'end_states': data_dict['end_states'][new_data_id][:num_val],
        'actions': data_dict['actions'][new_data_id][:num_val],
    }

    # TODO(GD): update coeff
    total_iters = 0
    for i_epochs in range(self.args.dynamics_epochs):
        train_data = self._replay_buffer.get_all_data(self)
        num_batches = len(train_data) // self.args.dynamics_batch_size
        avg_training_loss = []

        for i_batch in range(num_batches):
            # feed in the sub-batch
            feed_dict = {
                self._input_ph[key]: train_data[key][
                    i_batch * self.args.dynamics_batch_size:
                    (i_batch + 1) * self.args.dynamics_batch_size
                ]
                for key in ['start_states', 'end_states', 'actions']
            }
            fetch_dict = {
                'update_op': self._update_operator['update_op'],
                'loss': self._update_operator['loss']
            }
            training_stat = self._session.run(fetch_dict, feed_dict)
            avg_training_loss.append(training_stat['loss'])

            if total_iters % 2 == 0:
                self._session.run(self._update_operator['cov_update_op'],
                                  feed_dict)
            if total_iters % 20 == 0:
                self._session.run(self._update_operator['var_update_op'])
            total_iters += 1  # advance the counter for the periodic updates

        val_loss = self.eval(val_data)
        logger.info('[dynamics]: Val Loss: {}, Train Loss: {}'.format(
            val_loss, np.mean(avg_training_loss))
        )
def save_tf_model(sess, model_path, tf_var_list=[]):
    ''' @brief: save the tensorflow variables into a numpy npy file
    '''
    is_file_valid(model_path, save_file=True)
    logger.info('\tSAVING tensorflow variables')

    # get the tf weights one by one
    output_save_list = dict()
    for var in tf_var_list:
        weights = sess.run(var)
        output_save_list[var.name] = weights
        logger.info('\t\t[Checkpoint] saving tf parameter {}'.format(var.name))

    # save the model
    np.save(model_path, output_save_list)
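# A minimal round-trip sketch for save_tf_model / load_tf_model (hypothetical
# names, assuming the TF1-style graph/session API used throughout this code):
def _example_tf_checkpoint_roundtrip():
    import tensorflow as tf
    with tf.variable_scope('example_scope'):
        w = tf.get_variable('w', shape=[4, 2])
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # save_tf_model keys the npy dict by var.name, e.g. 'example_scope/w:0'
        save_tf_model(sess, '/tmp/example_tf_checkpoint.npy', [w])
        # load_tf_model assigns back every variable whose name matches a key
        load_tf_model(sess, '/tmp/example_tf_checkpoint.npy', tf_var_list=[w])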
def save_numpy_model(model_path, numpy_var_list={}):
    ''' @brief: save the numpy variables into a numpy npy file
    '''
    is_file_valid(model_path, save_file=True)
    logger.info('\tSAVING numpy variables')

    # get the numpy weights one by one
    output_save_list = dict()
    for key, var in numpy_var_list.items():
        output_save_list[key] = var
        logger.info('\t\t[Checkpoint] saving numpy parameter {}'.format(key))

    # save the model
    np.save(model_path, output_save_list)
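# The numpy twin of the sketch above (again with hypothetical names and
# assuming a numpy version where np.load can unpickle the dict, as the load
# helpers here expect): the checkpoint is just an .npy file holding a dict of
# arrays keyed by name.
def _example_numpy_checkpoint_roundtrip():
    import numpy as np
    params = {'baseline/w': np.zeros([4, 2]), 'ob_normalizer/mean': np.ones(4)}
    save_numpy_model('/tmp/example_np_checkpoint.npy', params)
    # load_numpy_model only fills in the keys it is asked for and returns them
    restored = load_numpy_model('/tmp/example_np_checkpoint.npy',
                                {key: None for key in params})
    assert np.allclose(restored['baseline/w'], params['baseline/w'])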
def _preprocess_data(self, rollout_data):
    """ @brief:
            Process the data, and collect the elements of ['start_state',
            'end_state', 'action', 'reward', 'return', 'ob',
            'action_dist_mu', 'action_dist_logstd']
    """
    # get the observations
    training_data = {}

    # get the returns (might be needed to train the policy)
    for i_episode in rollout_data:
        i_episode["returns"] = \
            misc_utils.get_return(i_episode["rewards"], self.args.gamma)

    training_data['start_state'] = np.concatenate(
        [i_episode['obs'][:-1] for i_episode in rollout_data])
    training_data['end_state'] = np.concatenate(
        [i_episode['obs'][1:] for i_episode in rollout_data])

    for key in ['action', 'reward', 'return',
                'old_action_dist_mu', 'old_action_dist_logstd']:
        training_data[key] = np.concatenate(
            [i_episode[key + 's'][:] for i_episode in rollout_data])

    # record the length
    training_data['episode_length'] = \
        [len(i_episode['rewards']) for i_episode in rollout_data]

    # get the episodic reward
    for i_episode in rollout_data:
        i_episode['episodic_reward'] = sum(i_episode['rewards'])
    avg_reward = np.mean(
        [i_episode['episodic_reward'] for i_episode in rollout_data])
    logger.info('Mean reward: {}'.format(avg_reward))

    training_data['whitening_stats'] = self._whitening_stats
    training_data['avg_reward'] = avg_reward
    training_data['avg_reward_std'] = \
        np.std([i_episode['episodic_reward'] for i_episode in rollout_data])
    training_data['rollout_data'] = rollout_data

    # update the timesteps so far
    self._timesteps_so_far += len(training_data['action'])

    return training_data
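# misc_utils.get_return is not shown here; a standard gamma-discounted return,
# which is presumably what the call above computes, looks roughly like this
# (discounted_return is an illustrative name, not the repo's helper):
def discounted_return(rewards, gamma):
    import numpy as np
    # returns[t] = sum_{k >= t} gamma ** (k - t) * rewards[k]
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns
# e.g. discounted_return([1.0, 1.0, 1.0], 0.9) -> [2.71, 1.9, 1.0]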
def load_expert_trajectory(traj_data_name, traj_episode_num):
    ''' @brief:
            Load the expert trajectory. It could either be a full trajectory
            or keyframe states.
        @output:
            The expert_trajectory is a list of dicts. Each dict corresponds
            to one episode, and has the keys 'observation' and 'timestep'.
            The size of expert_trajectory[0]['observation'] is
            @num_timestep by @num_ob_size

            example: expert_trajectory[0]['timestep'] = [2, 3, 5, ...]
    '''
    expert_trajectory = load_expert_data(traj_data_name, traj_episode_num)
    expert_trajectory_obs = np.concatenate(
        [i_traj['observation'] for i_traj in expert_trajectory])
    logger.info('Loaded expert trajectory')
    logger.info('Num_traj: {}, size: {}'.format(len(expert_trajectory),
                                                expert_trajectory_obs.shape))

    return expert_trajectory_obs
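# Hypothetical illustration of the data layout described in the docstring
# above (shapes and numbers are made up): each episode dict carries an
# [num_timestep, num_ob_size] observation array plus its keyframe timesteps.
def _example_expert_trajectory_layout():
    import numpy as np
    fake_trajectory = [
        {'observation': np.zeros([5, 17]), 'timestep': [2, 3, 5, 7, 11]},
        {'observation': np.zeros([5, 17]), 'timestep': [1, 4, 6, 8, 9]},
    ]
    obs = np.concatenate([i_traj['observation'] for i_traj in fake_trajectory])
    return obs.shape  # -> (10, 17), matching the "Num_traj ... size" log above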
def model_save_from_list(sess, model_path, tf_var_list=[], numpy_var_list={}):
    ''' @brief: if the var list is given, we just save them
    '''
    if not model_path.endswith('.npy'):
        model_path = model_path + '.npy'
    logger.info('saving checkpoint to {}'.format(model_path))
    output_save_list = dict()

    # get the tf weights one by one
    for var in tf_var_list:
        weights = sess.run(var)
        output_save_list[var.name] = weights
        logger.info('[checkpoint] saving tf parameter {}'.format(var.name))

    # get the numpy weights one by one
    for key, var in numpy_var_list.items():
        output_save_list[key] = var
        logger.info('[checkpoint] saving numpy parameter {}'.format(key))

    # save the model
    np.save(model_path, output_save_list)
    return
def _loss_function(self, sol, fetch_data_dict={}):
    """ @brief:
            the loss function to be used by the LBFGS optimizer
        @fetch_data_dict:
            We can fetch some intermediate variables (the interpolated
            qpos / qvel / qacc)
    """
    if self._camera_info['mode'] in ['static', 'trackcom']:
        # only the qposes
        sol_qpos = sol[self._var_to_sol_id['qpos']]
        sol_qpos = sol_qpos.reshape([-1, self._len_qpos])
        camera_state = sol[self._var_to_sol_id['camera_state']]

        total_loss, _fetch_data_dict = \
            self._loss_from_sol_qpos_camera_state(sol_qpos, camera_state)
    else:
        raise NotImplementedError  # TODO: the 'free' camera mode

    # gather the data that can be reused
    for key in fetch_data_dict:
        fetch_data_dict[key] = _fetch_data_dict[key]

    logger.info("Current loss: {}".format(total_loss))
    logger.info("\tphysics loss: {}".format(
        np.mean(_fetch_data_dict['physics_loss']))
    )
    logger.info("\tprojection loss: {}".format(
        np.mean(_fetch_data_dict['projection_loss']))
    )
    return total_loss
def __init__(self, sess, summary_name, enable=True, summary_dir=None):
    # the interface we need
    self.summary = None
    self.sess = sess
    self.enable = enable

    if not self.enable:  # the summary handler is disabled
        return

    if summary_dir is None:
        self.path = os.path.join(
            init_path.get_base_dir(), 'summary'
        )
    else:
        self.path = os.path.join(summary_dir, 'summary')
    self.path = os.path.abspath(self.path)

    if not os.path.exists(self.path):
        os.makedirs(self.path)
    self.path = os.path.join(self.path, summary_name)

    self.train_writer = tf.summary.FileWriter(self.path, self.sess.graph)
    logger.info(
        'summary writer initialized, writing to {}'.format(self.path))
def _build_models(self):
    self._build_session()
    self._network = {'policy': [], 'dynamics': [], 'reward': []}
    self._num_model_ensemble = {
        'policy': max(1, self.args.num_policy_ensemble),
        'dynamics': max(1, self.args.num_dynamics_ensemble),
        'reward': max(1, self.args.num_reward_ensemble),
    }

    for key in ['policy', 'dynamics', 'reward']:
        for i_model in range(self._num_model_ensemble[key]):
            name_scope = self._name_scope + '_' + key + '_' + str(i_model)
            self._network[key].append(self._network_type[key](
                self.args, self._session, name_scope,
                self._observation_size, self._action_size
            ))
            with tf.variable_scope(name_scope):
                self._network[key][-1].build_network()
                self._network[key][-1].build_loss()

        logger.info('Trainer maintains [{}] {} network(s)'.format(
            self._num_model_ensemble[key], key))

    # init the weights
    self._session.run(tf.global_variables_initializer())
def model_load_from_list(sess, model_path, tf_var_list=[], numpy_var_list={},
                         target_scope_switch='trpo_agent_policy'):
    ''' @brief:
            if the var list is given, we just load them
        @input:
            @target_scope_switch:
                the scope name that replaces 'trpo_agent_policy' in the
                checkpoint variable names
    '''
    if not model_path.endswith('.npy'):
        model_path = model_path + '.npy'
        logger.warning('[checkpoint] adding the ".npy" to the path name')
    logger.info('[checkpoint] loading checkpoint from {}'.format(model_path))
    output_save_list = np.load(model_path, encoding='latin1').item()

    tf_name_list = [var.name for var in tf_var_list]
    numpy_name_list = [key for key, val in numpy_var_list.items()]

    # get the weights one by one
    for name, val in output_save_list.items():
        name = name.replace('trpo_agent_policy', target_scope_switch)

        if name not in tf_name_list and name not in numpy_var_list:
            logger.info('**** Parameters Not Exist **** {}'.format(name))
            continue
        elif name in tf_name_list:
            logger.info('loading TF pretrained parameters {}'.format(name))
            tf_name_list.remove(name)  # just for sanity check

            # pick up the variable that has the name
            var = [var for var in tf_var_list if var.name == name][0]
            assign_op = var.assign(val)
            sess.run(assign_op)  # or `assign_op.op.run()`
        else:
            logger.info('loading numpy pretrained parameters {}'.format(name))
            numpy_name_list.remove(name)  # just for sanity check
            numpy_var_list[name] = val

    if len(tf_name_list) > 0 or len(numpy_name_list) > 0:
        logger.warning(
            'Some parameters are not loaded from the checkpoint: {}\n {}'.format(
                tf_name_list, numpy_name_list))
    return numpy_var_list
def log_results(results, timer_dict, start_timesteps=0):
    logger.info("-" * 15 + " Iteration %d " % results['iteration'] + "-" * 15)

    for i_id in range(len(timer_dict) - 1):
        start_key, end_key = list(timer_dict.keys())[i_id: i_id + 2]
        time_elapsed = (timer_dict[end_key] - timer_dict[start_key]) / 60.0
        logger.info("Time elapsed for [{}] is ".format(end_key) +
                    "%.4f mins" % time_elapsed)

    logger.info("{} total steps have happened".format(results['totalsteps']))

    # the stats
    from tensorboard_logger import log_value
    for key in results['stats']:
        logger.info("[{}]: {}".format(key, results['stats'][key]))
        if results['stats'][key] is not None:
            log_value(key, results['stats'][key],
                      start_timesteps + results['totalsteps'])
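# The timer_dict that log_results expects is just an OrderedDict of named
# wall-clock timestamps (as built in the train loops above); what gets logged
# is the elapsed minutes between consecutive entries. A hypothetical minimal
# layout for reference:
def _example_timer_dict():
    import time
    from collections import OrderedDict
    timer_dict = OrderedDict()
    timer_dict['Program Start'] = time.time()
    timer_dict['Generate Rollout'] = time.time()
    timer_dict['Train Weights'] = time.time()
    # log_results(results, timer_dict) would then report the time elapsed for
    # [Generate Rollout] and [Train Weights] in minutes
    return timer_dict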
def _get_fd_gradient(self, sol):
    """ @brief:
            use finite difference to calculate the gradient. Due to the
            locality of the solution space, we can use some small tricks to
            speed up the gradient computation
    """
    gradient = np.zeros([1, len(sol)])
    epsilon = 1e-3  # used for finite difference

    # get the base values, and the base interpolation values
    center_data_dict = {'physics_loss': None, 'projection_loss': None}
    center_loss = self._loss_function(sol, fetch_data_dict=center_data_dict)
    sol_qpos = np.reshape(sol[self._var_to_sol_id['qpos']],
                          [-1, self._len_qpos])
    camera_state = sol[self._var_to_sol_id['camera_state']]

    if 'qpos' in self._opt_var_list:
        logger.info('Calculating the gradient of qpos')
        # utilize the local connectivity of the qpos
        # locate the id of the qpos
        for i_derivative in range(self._num_sol_qpos * self._len_qpos):
            sol_id = i_derivative + self._var_to_sol_id['qpos'][0]
            center_sol_qpos_id = i_derivative // self._len_qpos
            start_sol_qpos_id = max(center_sol_qpos_id - 3, 0)
            end_sol_qpos_id = min(center_sol_qpos_id + 3,
                                  self._num_sol_qpos - 1)

            # get everything within the range of [start_sol_qpos_id,
            # end_sol_qpos_id], and take the forward finite difference step
            forward_sol_qpos = np.array(
                sol_qpos[start_sol_qpos_id: end_sol_qpos_id + 1], copy=True
            )
            forward_sol_qpos[center_sol_qpos_id - start_sol_qpos_id,
                             i_derivative % self._len_qpos] += epsilon
            forward_loss, forward_data_dict = \
                self._loss_from_sol_qpos_camera_state(
                    forward_sol_qpos, camera_state,
                    center_sol_qpos_id=center_sol_qpos_id
                )

            center_physics_loss = center_data_dict['physics_loss'][
                start_sol_qpos_id * self._sol_qpos_freq:
                end_sol_qpos_id * self._sol_qpos_freq
            ]
            center_projection_loss = center_data_dict['projection_loss'][
                start_sol_qpos_id * self._sol_qpos_freq:
                end_sol_qpos_id * self._sol_qpos_freq + 1
            ]

            # make sure the ids are matched
            assert len(forward_data_dict['physics_loss']) == \
                len(center_physics_loss) and \
                len(forward_data_dict['projection_loss']) == \
                len(center_projection_loss)

            difference_of_loss = forward_loss - \
                np.mean(center_physics_loss) - \
                np.mean(center_projection_loss)
            gradient[0, sol_id] = difference_of_loss

    for opt_var in ['xyz_pos', 'cam_view', 'fov', 'image_size']:
        if opt_var not in self._opt_var_list:
            continue
        logger.info('Calculating the gradient of {}'.format(opt_var))

        # TODO: for xyz_pos / fov / image_size, there is speed-up available
        for i_derivative in range(len(self._var_to_sol_id[opt_var])):
            sol_id = i_derivative + self._var_to_sol_id[opt_var][0]
            camera_state_id = sol_id - len(self._var_to_sol_id['qpos'])
            forward_camera_state = np.array(camera_state, copy=True)

            if opt_var == 'cam_view':
                # for the quaternion, take care of the length invariance
                quat_id = self._var_to_sol_id['quaternion']
                raise NotImplementedError
                forward_camera_state[camera_state_id] += \
                    epsilon * np.linalg.norm(sol[quat_id])
            else:
                forward_camera_state[camera_state_id] += epsilon

            forward_loss, _ = self._loss_from_sol_qpos_camera_state(
                sol_qpos, forward_camera_state
            )
            gradient[0, sol_id] = forward_loss - center_loss

    if len(self._opt_var_list) == 0:
        raise ValueError('At least one of the vars needs to be optimized')

    logger.info('Gradient calculated')
    return gradient
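# For reference, the textbook forward finite difference divides the loss
# difference by epsilon: grad_i ~= (f(x + eps * e_i) - f(x)) / eps. The method
# above stores the raw loss difference instead (differing from the true
# gradient only by the constant factor epsilon) and exploits the locality of
# qpos to avoid re-evaluating the full loss for every coordinate. A plain,
# slower version for comparison (illustrative helper, not from the repo):
def _example_fd_gradient(f, x, epsilon=1e-3):
    import numpy as np
    grad = np.zeros_like(x)
    base = f(x)
    for i in range(len(x)):
        perturbed = np.array(x, copy=True)
        perturbed[i] += epsilon
        grad[i] = (f(perturbed) - base) / epsilon
    return grad
# e.g. _example_fd_gradient(lambda v: (v ** 2).sum(), np.array([1.0, -2.0]))
# is approximately [2.0, -4.0]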
def run(self):
    self._build_model()

    while True:
        next_task = self._task_queue.get(block=True)

        if next_task[0] == parallel_util.WORKER_PLANNING:
            # collect rollouts
            plan = self._plan(next_task[1])
            self._task_queue.task_done()
            self._result_queue.put(plan)

        elif next_task[0] == parallel_util.WORKER_PLAYING:
            # collect rollouts
            traj_episode = self._play(next_task[1])
            self._task_queue.task_done()
            self._result_queue.put(traj_episode)

        elif next_task[0] == parallel_util.WORKER_RATE_ACTIONS:
            # predict the reward of a sequence of actions
            reward = self._rate_action(next_task[1])
            self._task_queue.task_done()
            self._result_queue.put(reward)

        elif next_task[0] == parallel_util.WORKER_GET_MODEL:
            # collect the gradients
            data_id = next_task[1]['data_id']
            if next_task[1]['type'] == 'dynamics_derivative':
                model_data = self._dynamics_derivative(
                    next_task[1]['data_dict'], next_task[1]['target'])
            elif next_task[1]['type'] == 'reward_derivative':
                model_data = self._reward_derivative(
                    next_task[1]['data_dict'], next_task[1]['target'])
            elif next_task[1]['type'] == 'forward_model':
                # get the next state
                model_data = self._dynamics(next_task[1]['data_dict'])
                model_data.update(self._reward(next_task[1]['data_dict']))

                if next_task[1]['end_of_traj']:
                    # get the reward at the final predicted state
                    # (evaluated with a zero action)
                    model_data['end_reward'] = self._reward({
                        'start_state': model_data['end_state'],
                        'action': next_task[1]['data_dict']['action'] * 0.0
                    })['reward']
            else:
                assert False
            self._task_queue.task_done()
            self._result_queue.put({'data': model_data, 'data_id': data_id})

        elif next_task[0] == parallel_util.AGENT_SET_WEIGHTS:
            # set the parameters of the actor policy
            self._set_weights(next_task[1])
            time.sleep(0.001)  # yield the process
            self._task_queue.task_done()

        elif next_task[0] == parallel_util.END_ROLLOUT_SIGNAL or \
                next_task[0] == parallel_util.END_SIGNAL:
            # kill all the threads
            logger.info("kill message for worker {}".format(self._worker_id))
            self._task_queue.task_done()
            break

        else:
            logger.error('Invalid task type {}'.format(next_task[0]))
    return
def visualize_sol_pose(physics_engine, output_dir, data_dict, env_name,
                       iteration, sub_iter):
    """ @brief: visualize the following four images
            0. the rendered image from dm_control
            1. the image using the qpos + camera_state (trained)
            2. the image using the qpos + gt_camera_state
            3. the image using the (gt_qpos + gt_camera_state)
    """
    logger.info("generating the visualization")
    image_size = int(data_dict['gt']['camera_state'][-1])
    assert image_size == int(data_dict['gt']['camera_state'][-1])  # TODO

    # from the camera and qposes to the 2d poses
    for qpos_key, camera_state_key in \
            [['gt', 'gt'], ['sol', 'sol'], ['sol', 'gt']]:
        pose_2d_key = qpos_key + "-" + camera_state_key
        data_dict[pose_2d_key] = {}  # save pose_2d

        is_trackcom = data_dict[camera_state_key]['mode'] == 'trackcom'
        pose_3d, center_of_mass = physics_engine.get_pose3d(
            data_dict[qpos_key]['qpos'], get_center_of_mass=is_trackcom)
        matrix = physics_engine.camera_matrix_from_state(
            data_dict[camera_state_key]['camera_state'], center_of_mass)

        data_dict[pose_2d_key]['pose_2d'] = \
            physics_engine.get_projected_2dpose(pose_3d, matrix)
        data_dict[pose_2d_key]['image_size'] = \
            data_dict[camera_state_key]['camera_state'][-1]

    pos_connection = POS_CONNECTION[env_name]

    # the output directory
    directory = os.path.join(output_dir, "video")
    if not os.path.exists(directory):
        os.mkdir(directory)
    output_dir = os.path.join(
        directory,
        "pos_Iter_" + str(iteration) + '_sub_' + str(sub_iter) + '.mp4')
    video = cv2.VideoWriter(
        os.path.join(init_path.get_abs_base_dir(), output_dir),
        cv2.VideoWriter_fourcc(*'mp4v'), 40, (image_size * 4, image_size))

    for i_pos_id in range(len(data_dict['gt']['qpos'])):
        # render the image using the default renderer
        render_image = physics_engine._env.render(
            camera_id=0, qpos=data_dict['gt']['qpos'][i_pos_id])

        # the sol_qpos + sol_camera_state
        sol_sol_image = draw_pose3d(render_image * 0.0,
                                    data_dict['sol-sol']['pose_2d'][i_pos_id],
                                    pos_connection)
        # the sol_qpos + gt_camera_state
        sol_gt_image = draw_pose3d(render_image * 0.0,
                                   data_dict['sol-gt']['pose_2d'][i_pos_id],
                                   pos_connection)
        # the gt_qpos + gt_camera_state
        gt_gt_image = draw_pose3d(render_image * 0.0,
                                  data_dict['gt-gt']['pose_2d'][i_pos_id],
                                  pos_connection)

        image = np.hstack(
            [render_image, sol_sol_image, sol_gt_image, gt_gt_image])
        print('Processing %d out of %d' %
              (i_pos_id, len(data_dict['gt']['qpos'])))
        video.write(np.array(image[:, :, [2, 1, 0]], dtype=np.uint8))

    video.release()
def train(self, data_dict, replay_buffer, training_info={}):
    # make sure the needed data is ready
    assert 'plan_data' in training_info
    self._plan_data = training_info['plan_data']
    self._set_whitening_var(data_dict['whitening_stats'])

    # step 1: get the target action mean and the target precision matrix
    '''
    assert len(self._plan_data) == self._num_traj and \
        len(self._plan_data[0]['new_u']) == self._traj_depth
    num_data = len(self._plan_data) * len(self._plan_data[0]['u'])
    '''
    '''
    target_mu = np.zeros([num_data, self._action_size])
    target_precision = np.ones([num_data, self._action_size,
                                self._action_size])
    '''
    training_data, num_data = self._get_training_dataset(data_dict)

    # step 2: train the mean of the action
    if num_data < self.args.policy_sub_batch_size:
        logger.warning("Not enough data!")
        return {}
    batch_per_epoch = num_data // self.args.policy_sub_batch_size

    feed_dict = {
        self._input_ph['target_action_mu']: training_data['target_mu'],
        self._input_ph['target_precision']:
            training_data['target_precision'],
        self._input_ph['start_state']: training_data['start_state']
    }

    for i_iteration in range(self.args.policy_epochs):
        data_id = list(range(num_data))  # list so that shuffle works in py3
        self._npr.shuffle(data_id)
        avg_loss = []

        for i_batch in range(batch_per_epoch):
            batch_idx = data_id[
                i_batch * self.args.policy_sub_batch_size:
                (i_batch + 1) * self.args.policy_sub_batch_size
            ]
            sub_feed_dict = {
                key: feed_dict[key][batch_idx] for key in feed_dict
            }
            fetch_dict = {
                'update_op': self._update_operator['update_op'],
                'loss': self._update_operator['loss']
            }
            training_stat = self._session.run(fetch_dict, sub_feed_dict)
            avg_loss.append(training_stat['loss'])

        logger.info('GPS policy loss {}'.format(np.mean(avg_loss)))

    # the covariance of the controller
    self._policy_cov_data['inv_cov'] = \
        np.mean(training_data['target_precision'], 0) + \
        self.args.gps_policy_cov_damping * \
        np.ones([self._action_size, self._action_size])
    # self._policy_cov_data['precision'] = \
    #     np.diag(self._policy_cov_data['inv_cov'])
    # self._policy_cov_data['cov'] = \
    #     np.diag(1.0 / self._policy_cov_data['precision'])
    self._policy_cov_data['var'] = \
        1 / np.diag(self._policy_cov_data['inv_cov'])  # vector
    self._policy_cov_data['sig'] = \
        np.diag(self._policy_cov_data['var'])  # matrix
    self._policy_cov_data['chol_pol_covar'] = \
        np.diag(np.sqrt(self._policy_cov_data['var']))  # matrix
    self._policy_cov_data['flat_cov_L'][:] = \
        np.diag(self._policy_cov_data['chol_pol_covar'])  # vector

    return training_stat
if args.gt_reward:
    from mbbl.network.reward.groundtruth_reward import reward_network
else:
    from mbbl.network.reward.deterministic_reward import reward_network

mb_timesteps, policy_weight = train_mb(mbmf_trainer, mbmf_sampler,
                                       mbmf_worker, dynamics_network,
                                       mbmf_policy_network, reward_network,
                                       args)
tf.reset_default_graph()
print('==================TRPO training starts==================')

# manually set the batch size to 50K if needed
# args.timesteps_per_batch = 50000
# args.policy_batch_size = 50000
logger.info("batch size for trpo is {}".format(args.timesteps_per_batch))

from mbbl.sampler import singletask_sampler
from mbbl.worker import mf_worker
# from mbbl.network.policy.trpo_policy import policy_network
import mbbl.network.policy.trpo_policy
import mbbl.network.policy.ppo_policy
policy_network = {
    'ppo': mbbl.network.policy.ppo_policy.policy_network,
    'trpo': mbbl.network.policy.trpo_policy.policy_network
}[args.trust_region_method]

# here the dynamics and reward are simply placeholders, which cannot be
# called to predict the next state or the reward
from mbbl.network.dynamics.base_dynamics import base_dynamics_network
def train_mb(trainer, sampler, worker, dynamics, policy, reward, args=None):
    logger.info('Training starts at {}'.format(init_path.get_abs_base_dir()))
    network_type = {'policy': policy, 'dynamics': dynamics, 'reward': reward}

    # make the trainer and sampler
    sampler_agent = make_sampler(sampler, worker, network_type, args)
    trainer_tasks, trainer_results, trainer_agent, init_weights = \
        make_trainer(trainer, network_type, args)
    sampler_agent.set_weights(init_weights)

    timer_dict = OrderedDict()
    timer_dict['Program Start'] = time.time()
    totalsteps = 0
    current_iteration = 0
    init_data = {}

    # start the mb training
    while True:
        timer_dict['** Program Total Time **'] = time.time()

        # step 1: collect rollout data
        if current_iteration == 0 and args.random_timesteps > 0 and \
                (not (args.gt_dynamics and args.gt_reward)):
            # we could first generate random rollout data for exploration
            logger.info(
                'Generating {} random timesteps'.format(args.random_timesteps)
            )
            rollout_data = sampler_agent.rollouts_using_worker_planning(
                args.random_timesteps, use_random_action=True
            )
        else:
            rollout_data = sampler_agent.rollouts_using_worker_planning()
        timer_dict['Generate Rollout'] = time.time()

        # step 2: train the weights of the dynamics and reward networks
        training_info = {'network_to_train': ['dynamics', 'reward']}
        trainer_tasks.put(
            (parallel_util.TRAIN_SIGNAL,
             {'data': rollout_data['data'], 'training_info': training_info})
        )
        trainer_tasks.join()
        training_return = trainer_results.get()
        timer_dict['Train Weights'] = time.time()

        # step 3: update the weights
        sampler_agent.set_weights(training_return['network_weights'])
        timer_dict['Assign Weights'] = time.time()

        # log and print the results
        log_results(training_return, timer_dict)

        for key in rollout_data.keys():
            if key not in init_data.keys():
                init_data[key] = []
            init_data[key].extend(rollout_data[key])

        # add noise to the initial data to encourage trpo to explore
        import numpy as np
        for i_rollout in init_data['data']:
            action = i_rollout['actions']
            i_rollout['actions'] += np.random.normal(scale=0.005,
                                                     size=action.shape)

        if totalsteps > args.max_timesteps or \
                training_return['replay_buffer'].get_current_size() > \
                args.mb_timesteps:
            break
        else:
            current_iteration += 1
            totalsteps = training_return['totalsteps']

    # initialize the policy network
    training_info = {'network_to_train': ['reward', 'policy']}
    trainer_tasks.put(
        (parallel_util.MBMF_INITIAL,
         {'data': init_data['data'], 'training_info': training_info})
    )
    trainer_tasks.join()
    training_return = trainer_results.get()
    timer_dict['Train Weights'] = time.time()

    # start the dagger iterations
    for dagger_i in range(args.dagger_iter):
        print('=================Doing dagger iteration {}=================='
              .format(dagger_i))
        # collect on-policy rollouts
        rollout_data = sampler_agent.rollouts_using_worker_playing(
            num_timesteps=args.dagger_timesteps_per_iter, use_true_env=True)
        sampler_agent.dagger_rollouts(rollout_data['data'])
        init_data['data'] += rollout_data['data']

        trainer_tasks.put(
            (parallel_util.MBMF_INITIAL,
             {'data': init_data['data'], 'training_info': training_info})
        )
        trainer_tasks.join()
        training_return = trainer_results.get()

    trainer_tasks.put((parallel_util.GET_POLICY_WEIGHT, None))
    trainer_tasks.join()
    policy_weight = trainer_results.get()

    # end of training
    sampler_agent.end()
    trainer_tasks.put((parallel_util.END_SIGNAL, None))
    return totalsteps, policy_weight