def optimize_policy(self, all_samples_data, log=True): """ Performs MAML outer step Args: all_samples_data (list) : list of lists of lists of samples (each is a dict) split by gradient update and meta task log (bool) : whether to log statistics Returns: None """ meta_op_input_dict = self._extract_input_dict_meta_op( all_samples_data, self._optimization_keys) if log: logger.log("Optimizing") loss_before = self.optimizer.optimize( input_val_dict=meta_op_input_dict) if log: logger.log("Computing statistics") loss_after = self.optimizer.loss(input_val_dict=meta_op_input_dict) if log: logger.logkv('LossBefore', loss_before) logger.logkv('LossAfter', loss_after)
def log_diagnostics(self, paths, prefix=''):
    dist = [-path["env_infos"]['reward_dist'] for path in paths]
    final_dist = [-path["env_infos"]['reward_dist'][-1] for path in paths]
    # ctrl_cost = [-path["env_infos"]['reward_ctrl'] for path in paths]

    logger.logkv(prefix + 'AvgDistance', np.mean(dist))
    logger.logkv(prefix + 'AvgFinalDistance', np.mean(final_dist))

def log_diagnostics(self, paths, prefix=''): """ Log extra information per iteration based on the collected paths """ log_stds = np.vstack( [path["agent_infos"]["log_std"] for path in paths]) logger.logkv(prefix + 'AveragePolicyStd', np.mean(np.exp(log_stds)))
def step(self): time_step = time.time() """ -------------------- Sampling --------------------------""" if self.verbose: logger.log("Policy is obtaining samples ...") paths = self.model_sampler.obtain_samples(log=True, log_prefix='Policy-') """ ----------------- Processing Samples ---------------------""" if self.verbose: logger.log("Policy is processing samples ...") samples_data = self.model_sample_processor.process_samples( paths, log='all', log_prefix='Policy-') if type(paths) is list: self.log_diagnostics(paths, prefix='Policy-') else: self.log_diagnostics(sum(paths.values(), []), prefix='Policy-') """ ------------------ Policy Update ---------------------""" if self.verbose: logger.log("Policy optimization...") # This needs to take all samples_data so that it can construct graph for meta-optimization. self.algo.optimize_policy(samples_data, log=True, verbose=self.verbose, prefix='Policy-') self.policy = self.model_sampler.policy time_step = time.time() - time_step logger.logkv('Policy-TimeStep', time_step)
def _synch(self, samples_data_arr, check_init=False):
    time_synch = time.time()
    if self.verbose:
        logger.log('Model at {} is synchronizing...'.format(
            self.itr_counter))
    obs = np.concatenate([
        samples_data['observations'] for samples_data in samples_data_arr
    ])
    act = np.concatenate(
        [samples_data['actions'] for samples_data in samples_data_arr])
    obs_next = np.concatenate([
        samples_data['next_observations']
        for samples_data in samples_data_arr
    ])
    self.dynamics_model.update_buffer(
        obs=obs,
        act=act,
        obs_next=obs_next,
        check_init=check_init,
    )

    # Reset variables for early stopping condition
    self.with_new_data = True
    self.remaining_model_idx = list(range(self.dynamics_model.num_models))
    self.valid_loss_rolling_average = None

    time_synch = time.time() - time_synch
    logger.logkv('Model-TimeSynch', time_synch)

def step(self, random=False): time_step = time.time() '''------------- Obtaining samples from the environment -----------''' if self.verbose: logger.log("Data is obtaining samples...") env_paths = self.env_sampler.obtain_samples( log=True, random=random, log_prefix='Data-EnvSampler-', ) '''-------------- Processing environment samples -------------------''' if self.verbose: logger.log("Data is processing environment samples...") samples_data = self.dynamics_sample_processor.process_samples( env_paths, log=True, log_prefix='Data-EnvTrajs-', ) self.samples_data_arr.append(samples_data) time_step = time.time() - time_step time_sleep = max(self.simulation_sleep - time_step, 0) time.sleep(time_sleep) logger.logkv('Data-TimeStep', time_step) logger.logkv('Data-TimeSleep', time_sleep)
def push(self):
    time_push = time.time()
    self.queue_next.put(pickle.dumps(self.samples_data_arr))
    self.samples_data_arr = []
    time_push = time.time() - time_push

    logger.logkv('Data-TimePush', time_push)

def optimize_policy(self, all_samples_data, log=True): """ Performs MAML outer step Args: all_samples_data (list) : list of lists of lists of samples (each is a dict) split by gradient update and meta task log (bool) : whether to log statistics Returns: None """ meta_op_input_dict = self._extract_input_dict_meta_op( all_samples_data, self._optimization_keys) logger.log("Computing KL before") mean_kl_before = self.optimizer.constraint_val(meta_op_input_dict) logger.log("Computing loss before") loss_before = self.optimizer.loss(meta_op_input_dict) logger.log("Optimizing") self.optimizer.optimize(meta_op_input_dict) logger.log("Computing loss after") loss_after = self.optimizer.loss(meta_op_input_dict) logger.log("Computing KL after") mean_kl = self.optimizer.constraint_val(meta_op_input_dict) if log: logger.logkv('MeanKLBefore', mean_kl_before) logger.logkv('MeanKL', mean_kl) logger.logkv('LossBefore', loss_before) logger.logkv('LossAfter', loss_after) logger.logkv('dLoss', loss_before - loss_after)
def _synch(self, policy_state_pickle):
    time_synch = time.time()
    policy_state = pickle.loads(policy_state_pickle)
    assert isinstance(policy_state, dict)
    self.env_sampler.policy.set_shared_params(policy_state)
    time_synch = time.time() - time_synch

    logger.logkv('Data-TimeSynch', time_synch)

def obtain_samples(self, log=False, log_prefix='', buffer=None):
    """
    Collect trajectories by running the policy rollout graph

    Args:
        log (boolean): whether to log sampling times
        log_prefix (str) : prefix for logger
        buffer : unused in this sampler

    Returns:
        (list) : a list of num_rollouts path dicts (observations, actions,
            rewards, dones, env_infos, agent_infos), each of length max_path_length
    """
    # initial setup / preparation
    policy = self.policy
    policy.reset(dones=[True] * self.num_rollouts)

    # initial reset of meta_envs
    init_obses = np.array(
        [self.env.reset() for _ in range(self.num_rollouts)])

    sess = tf.get_default_session()
    observations, actions, means, log_stds, rewards = sess.run(
        [
            self._observations_var, self._actions_var, self._means_var,
            self._log_stds_var, self._rewards_var
        ],
        feed_dict={self._initial_obs_ph: init_obses})

    # outputs are time-major; transpose to (num_rollouts, max_path_length, dim)
    means = np.array(means).transpose((1, 0, 2))
    log_stds = np.array(log_stds).transpose((1, 0, 2))
    if log_stds.shape[0] == 1:
        log_stds = np.repeat(log_stds, self.num_rollouts, axis=0)
    agent_infos = [
        dict(mean=mean, log_std=log_std)
        for mean, log_std in zip(means, log_stds)
    ]

    observations = np.array(observations).transpose((1, 0, 2))
    actions = np.array(actions).transpose((1, 0, 2))
    rewards = np.array(rewards).T
    dones = [[False for _ in range(self.max_path_length)]
             for _ in range(self.num_rollouts)]
    env_infos = [dict() for _ in range(self.num_rollouts)]
    paths = [
        dict(observations=obs,
             actions=act,
             rewards=rew,
             dones=done,
             env_infos=env_info,
             agent_infos=agent_info)
        for obs, act, rew, done, env_info, agent_info in zip(
            observations, actions, rewards, dones, env_infos, agent_infos)
    ]
    self.total_timesteps_sampled += self.total_samples

    logger.logkv('ModelSampler-n_timesteps', self.total_timesteps_sampled)

    return paths

def log_diagnostics(self, paths, prefix=''):
    reach_rew = [path["env_infos"]['reachRew'] for path in paths]
    pick_rew = [path["env_infos"]['pickRew'][-1] for path in paths]
    place_rew = [path["env_infos"]['placeRew'] for path in paths]
    reach_dist = [path["env_infos"]['reachDist'] for path in paths]
    placing_dist = [path["env_infos"]['placingDist'] for path in paths]

    logger.logkv(prefix + 'AverageReachReward', np.mean(reach_rew))
    logger.logkv(prefix + 'AveragePickReward', np.mean(pick_rew))
    logger.logkv(prefix + 'AveragePlaceReward', np.mean(place_rew))
    logger.logkv(prefix + 'AverageReachDistance', np.mean(reach_dist))
    logger.logkv(prefix + 'AveragePlaceDistance', np.mean(placing_dist))

def log_diagnostics(self, paths, prefix=''):
    progs = [
        np.mean(path["env_infos"]["reward_forward"]) for path in paths
    ]
    ctrl_cost = [
        -np.mean(path["env_infos"]["reward_ctrl"]) for path in paths
    ]

    logger.logkv(prefix + 'AverageForwardReturn', np.mean(progs))
    logger.logkv(prefix + 'MaxForwardReturn', np.max(progs))
    logger.logkv(prefix + 'MinForwardReturn', np.min(progs))
    logger.logkv(prefix + 'StdForwardReturn', np.std(progs))
    logger.logkv(prefix + 'AverageCtrlCost', np.mean(ctrl_cost))

def optimize_policy(self, buffer, timestep, grad_steps, log=True):
    sess = tf.get_default_session()
    for i in range(grad_steps):
        feed_dict = create_feed_dict(
            placeholder_dict=self.op_phs_dict,
            value_dict=buffer.random_batch(self.sampler_batch_size))
        sess.run(self.training_ops, feed_dict)
        if log:
            diagnostics = sess.run({**self.diagnostics_ops}, feed_dict)
            for k, v in diagnostics.items():
                logger.logkv(k, v)
        if timestep % self.target_update_interval == 0:
            self._update_target()

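# create_feed_dict above is assumed to pair each placeholder in `placeholder_dict`
# with the value of the same key in the sampled batch. A minimal sketch under that
# assumption (the actual helper in the codebase may handle nested keys differently):
def create_feed_dict(placeholder_dict, value_dict):
    """Build a TF feed_dict by matching placeholders and batch values on their keys."""
    return {
        placeholder: value_dict[key]
        for key, placeholder in placeholder_dict.items()
        if key in value_dict
    }
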
def _synch(self, dynamics_model_state_pickle):
    time_synch = time.time()
    if self.verbose:
        logger.log('Policy is synchronizing...')
    dynamics_model_state = pickle.loads(dynamics_model_state_pickle)
    assert isinstance(dynamics_model_state, dict)
    self.model_sampler.dynamics_model.set_shared_params(
        dynamics_model_state)
    if hasattr(self.model_sampler, 'vec_env'):
        self.model_sampler.vec_env.dynamics_model.set_shared_params(
            dynamics_model_state)
    time_synch = time.time() - time_synch

    logger.logkv('Policy-TimeSynch', time_synch)

def optimize_policy(self, samples_data, log=True, prefix='', verbose=False):
    """
    Performs MAML outer step

    Args:
        samples_data (list) : list of lists of lists of samples (each is a dict) split by gradient update and meta task
        log (bool) : whether to log statistics

    Returns:
        None
    """
    input_dict = self._extract_input_dict(samples_data,
                                          self._optimization_keys,
                                          prefix='train')
    entropy_loss, reward_loss = self.optimizer.compute_loss_variations(
        input_dict, self.entropy_loss, self.reward_loss, self.log_values)

    if verbose:
        logger.log("Optimizing")
    # Update model
    loss_before = self.optimizer.optimize(input_val_dict=input_dict)

    if verbose:
        logger.log("Computing statistics")
    loss_after = self.optimizer.loss(input_val_dict=input_dict)

    if log:
        logger.logkv(prefix + 'Loss/LossBefore', loss_before)
        logger.logkv(prefix + 'Loss/LossAfter', loss_after)
        logger.logkv(prefix + 'Loss/PartialLossEntropy', entropy_loss)
        logger.logkv(prefix + 'Loss/PartialLossReward', reward_loss)

def push(self):
    time_push = time.time()
    state_pickle = pickle.dumps(
        self.dynamics_model.get_shared_param_values())
    assert state_pickle is not None
    # Drop stale entries so the consumer always sees near-fresh model parameters
    while self.queue_next.qsize() > 5:
        try:
            logger.log('Model is off loading data from queue_next...')
            _ = self.queue_next.get_nowait()
        except Empty:
            break
    self.queue_next.put(state_pickle)
    time_push = time.time() - time_push

    logger.logkv('Model-TimePush', time_push)

def optimize_policy(self, all_samples_data, log=True): """ Performs MAML outer step Args: all_samples_data (list) : list of lists of lists of samples (each is a dict) split by gradient update and meta task log (bool) : whether to log statistics Returns: None """ meta_op_input_dict = self._extract_input_dict_meta_op( all_samples_data, self._optimization_keys) # add kl_coeffs / clip_eps to meta_op_input_dict meta_op_input_dict['inner_kl_coeff'] = self.inner_kl_coeff if self.clip_outer: meta_op_input_dict['clip_eps'] = self.clip_eps else: meta_op_input_dict['outer_kl_coeff'] = self.outer_kl_coeff if log: logger.log("Optimizing") loss_before = self.optimizer.optimize( input_val_dict=meta_op_input_dict) if log: logger.log("Computing statistics") loss_after, inner_kls, outer_kl = self.optimizer.compute_stats( input_val_dict=meta_op_input_dict) if self.adaptive_inner_kl_penalty: if log: logger.log("Updating inner KL loss coefficients") self.inner_kl_coeff = self.adapt_kl_coeff(self.inner_kl_coeff, inner_kls, self.target_inner_step) if self.adaptive_outer_kl_penalty: if log: logger.log("Updating outer KL loss coefficients") self.outer_kl_coeff = self.adapt_kl_coeff(self.outer_kl_coeff, outer_kl, self.target_outer_step) if log: logger.logkv('LossBefore', loss_before) logger.logkv('LossAfter', loss_after) logger.logkv('KLInner', np.mean(inner_kls)) logger.logkv('KLCoeffInner', np.mean(self.inner_kl_coeff)) if not self.clip_outer: logger.logkv('KLOuter', outer_kl)
def push(self):
    time_push = time.time()
    policy_state_pickle = pickle.dumps(
        self.policy.get_shared_param_values())
    assert policy_state_pickle is not None
    # Drop stale entries so the consumer always sees near-fresh policy parameters
    while self.queue_next.qsize() > 5:
        try:
            logger.log('Policy is off loading data from queue_next...')
            _ = self.queue_next.get_nowait()
        except Empty:
            # very rare chance to reach here
            break
    self.queue_next.put(policy_state_pickle)
    time_push = time.time() - time_push

    logger.logkv('Policy-TimePush', time_push)

def step(self):
    time_step = time.time()

    ''' --------------- MAML steps --------------- '''
    self.policy.switch_to_pre_update()  # Switch to pre-update policy
    all_samples_data = []

    for step in range(self.num_inner_grad_steps + 1):
        if self.verbose:
            logger.log("Policy Adaptation-Step %d" % step)

        """ -------------------- Sampling --------------------------"""
        # time_sampling = time.time()
        paths = self.model_sampler.obtain_samples(log=True,
                                                  log_prefix='Policy-',
                                                  buffer=None)
        # time_sampling = time.time() - time_sampling

        """ ----------------- Processing Samples ---------------------"""
        # time_sample_proc = time.time()
        samples_data = self.model_sample_processor.process_samples(
            paths, log='all', log_prefix='Policy-')
        all_samples_data.append(samples_data)
        # time_sample_proc = time.time() - time_sample_proc

        self.log_diagnostics(sum(list(paths.values()), []),
                             prefix='Policy-')

        """ ------------------- Inner Policy Update --------------------"""
        # time_algo_adapt = time.time()
        if step < self.num_inner_grad_steps:
            self.algo._adapt(samples_data)
        # time_algo_adapt = time.time() - time_algo_adapt

    """ ------------------ Outer Policy Update ---------------------"""
    if self.verbose:
        logger.log("Policy is optimizing...")
    # This needs to take all samples_data so that it can construct graph for meta-optimization.
    # time_algo_opt = time.time()
    self.algo.optimize_policy(all_samples_data, prefix='Policy-')
    # time_algo_opt = time.time() - time_algo_opt

    time_step = time.time() - time_step
    self.policy = self.model_sampler.policy

    logger.logkv('Policy-TimeStep', time_step)

def optimize_policy(self, samples_data, log=True, prefix='', verbose=False):
    """
    Performs MAML outer step

    Args:
        samples_data (list) : list of lists of lists of samples (each is a dict) split by gradient update and meta task
        log (bool) : whether to log statistics

    Returns:
        None
    """
    input_dict = self._extract_input_dict(samples_data,
                                          self._optimization_keys,
                                          prefix='train')

    if verbose:
        logger.log("Computing KL before")
    mean_kl_before = self.optimizer.constraint_val(
        input_val_dict=input_dict)

    if verbose:
        logger.log("Computing loss before")
    loss_before = self.optimizer.loss(input_val_dict=input_dict)

    if verbose:
        logger.log("Optimizing")
    self.optimizer.optimize(input_val_dict=input_dict)

    if verbose:
        logger.log("Computing loss after")
    loss_after = self.optimizer.loss(input_val_dict=input_dict)

    if verbose:
        logger.log("Computing KL after")
    mean_kl = self.optimizer.constraint_val(input_val_dict=input_dict)

    if log:
        logger.logkv(prefix + 'MeanKLBefore', mean_kl_before)
        logger.logkv(prefix + 'MeanKL', mean_kl)
        logger.logkv(prefix + 'LossBefore', loss_before)
        logger.logkv(prefix + 'LossAfter', loss_after)
        logger.logkv(prefix + 'dLoss', loss_before - loss_after)

def step(self, random=False): time_step = time.time() '''------------- Obtaining samples from the environment -----------''' if self.verbose: logger.log("Data is obtaining samples...") env_paths = self.env_sampler.obtain_samples( log=True, random=random, log_prefix='Data-EnvSampler-', ) '''-------------- Processing environment samples -------------------''' if self.verbose: logger.log("Data is processing samples...") if type(env_paths) is dict or type(env_paths) is OrderedDict: env_paths = list(env_paths.values()) idxs = np.random.choice(range(len(env_paths)), size=self.num_rollouts_per_iter, replace=False) env_paths = sum([env_paths[idx] for idx in idxs], []) elif type(env_paths) is list: idxs = np.random.choice(range(len(env_paths)), size=self.num_rollouts_per_iter, replace=False) env_paths = [env_paths[idx] for idx in idxs] else: raise TypeError samples_data = self.dynamics_sample_processor.process_samples( env_paths, log=True, log_prefix='Data-EnvTrajs-', ) self.samples_data_arr.append(samples_data) time_step = time.time() - time_step time_sleep = max(self.simulation_sleep - time_step, 0) time.sleep(time_sleep) logger.logkv('Data-TimeStep', time_step) logger.logkv('Data-TimeSleep', time_sleep)
def train(self):
    """
    Trains policy on env using algo
    """
    worker_data_queue, worker_model_queue, worker_policy_queue = self.queues
    worker_data_remote, worker_model_remote, worker_policy_remote = self.remotes

    for p in self.ps:
        p.start()

    ''' --------------- worker warm-up --------------- '''
    logger.log('Prepare start...')

    worker_data_remote.send('prepare start')
    worker_data_queue.put(self.initial_random_samples)
    assert worker_data_remote.recv() == 'loop ready'

    worker_model_remote.send('prepare start')
    assert worker_model_remote.recv() == 'loop ready'

    worker_policy_remote.send('prepare start')
    assert worker_policy_remote.recv() == 'loop ready'

    time_total = time.time()

    ''' --------------- worker looping --------------- '''
    logger.log('Start looping...')
    for remote in self.remotes:
        remote.send('start loop')

    ''' --------------- collect info --------------- '''
    for remote in self.remotes:
        assert remote.recv() == 'loop done'
    logger.log('\n------------all workers exit loops -------------')
    for remote in self.remotes:
        assert remote.recv() == 'worker closed'

    for p in self.ps:
        p.terminate()

    logger.logkv('Trainer-TimeTotal', time.time() - time_total)
    logger.dumpkvs()
    logger.log("*****Training finished")

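# The trainer above drives each worker through a fixed handshake: 'prepare start' ->
# 'loop ready', then 'start loop', then 'loop done' once the worker's loop ends, and
# finally 'worker closed'. A hypothetical worker-side loop that satisfies this
# protocol (the actual worker classes add queue processing, synch/push logic, and
# stop conditions on top of it) might look like:
def worker_loop(remote, prepare_fn, step_fn, n_iters):
    assert remote.recv() == 'prepare start'
    prepare_fn()                      # e.g. warm-start from initial random samples
    remote.send('loop ready')

    assert remote.recv() == 'start loop'
    for _ in range(n_iters):
        step_fn()                     # one sampling / model-fit / optimization step
    remote.send('loop done')

    remote.send('worker closed')
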
def process_queue(self):
    do_push = 0
    samples_data_arr = []
    while True:
        try:
            if not self.remaining_model_idx:
                logger.log(
                    'Model at iteration {} is block waiting for data'.format(
                        self.itr_counter))
                # FIXME: check stop_cond
                time_wait = time.time()
                samples_data_arr_pickle = self.queue.get()
                time_wait = time.time() - time_wait
                logger.logkv('Model-TimeBlockWait', time_wait)
                self.remaining_model_idx = list(
                    range(self.dynamics_model.num_models))
            else:
                if self.verbose:
                    logger.log('Model try get_nowait.........')
                samples_data_arr_pickle = self.queue.get_nowait()
            if samples_data_arr_pickle == 'push':
                # Only push once before executing another step
                if do_push == 0:
                    do_push = 1
                    self.push()
            else:
                samples_data_arr.extend(
                    pickle.loads(samples_data_arr_pickle))
        except Empty:
            break

    do_synch = len(samples_data_arr)
    if do_synch:
        self._synch(samples_data_arr)

    do_step = 1

    if self.verbose:
        logger.log(
            'Model finishes processing queue with {}, {}, {}......'.format(
                do_push, do_synch, do_step))
    return do_push, do_synch, do_step

def step(self, obs=None, act=None, obs_next=None):
    time_model_fit = time.time()

    """ --------------- fit dynamics model --------------- """
    if self.verbose:
        logger.log(
            'Model at iteration {} is training for one epoch...'.format(
                self.itr_counter))
    self.remaining_model_idx, self.valid_loss_rolling_average = \
        self.dynamics_model.fit_one_epoch(
            remaining_model_idx=self.remaining_model_idx,
            valid_loss_rolling_average_prev=self.valid_loss_rolling_average,
            with_new_data=self.with_new_data,
            verbose=self.verbose,
            log_tabular=True,
            prefix='Model-',
        )
    self.with_new_data = False

    time_model_fit = time.time() - time_model_fit
    logger.logkv('Model-TimeStep', time_model_fit)

def optimize_supervised(self, samples_data, log=True, prefix='', verbose=False):
    input_dict = self._extract_input_dict(samples_data,
                                          self._optimization_keys,
                                          prefix='train')
    self.optimizer_s.compute_loss_variations(input_dict, None, None,
                                             self.log_values_sup)
    if verbose:
        logger.log("Optimizing Supervised Model")
    loss_before = self.optimizer_s.optimize(input_val_dict=input_dict)
    if verbose:
        logger.log("Computing statistics")
    loss_after = self.optimizer_s.loss(input_val_dict=input_dict)
    if log:
        logger.logkv(prefix + 'SupervisedLossBefore', loss_before)
        logger.logkv(prefix + 'SupervisedLossAfter', loss_after)

def log_diagnostics(self, paths, prefix=''):
    fwrd_vel = [path["env_infos"]['reward_run'] for path in paths]
    final_fwrd_vel = [path["env_infos"]['reward_run'][-1] for path in paths]
    ctrl_cost = [-path["env_infos"]['reward_ctrl'] for path in paths]

    logger.logkv(prefix + 'AvgForwardVel', np.mean(fwrd_vel))
    logger.logkv(prefix + 'AvgFinalForwardVel', np.mean(final_fwrd_vel))
    logger.logkv(prefix + 'AvgCtrlCost', np.mean(ctrl_cost))

def log_diagnostics(paths, prefix=''):
    forward_vel = [
        np.mean(path['env_infos']['forward_vel']) for path in paths
    ]
    ctrl_cost = [
        np.mean(path['env_infos']['control_cost']) for path in paths
    ]
    # stability_cost = [np.mean(path['env_infos']['stability_cost']) for path in paths]
    path_length = [path["observations"].shape[0] for path in paths]

    logger.logkv(prefix + 'AvgForwardVel', np.mean(forward_vel))
    logger.logkv(prefix + 'StdForwardVel', np.std(forward_vel))
    logger.logkv(prefix + 'AvgCtrlCost', np.mean(ctrl_cost))
    # logger.logkv(prefix + 'AvgStabilityCost', np.mean(stability_cost))
    logger.logkv(prefix + 'AvgPathLength', np.mean(path_length))

def log_diagnostics(self, paths, prefix=''):
    reach_dist = [path["env_infos"]['reachDist'] for path in paths]
    placing_dist = [path["env_infos"]['placeDist'] for path in paths]
    cos_dist = [path["env_infos"]['cosDist'] for path in paths]

    logger.logkv(prefix + 'AverageReachDistance', np.mean(reach_dist))
    logger.logkv(prefix + 'AveragePlaceDistance', np.mean(placing_dist))
    logger.logkv(prefix + 'AverageCosDistance', np.mean(cos_dist))

def step(self, random_sinusoid=(False, False)):
    time_step = time.time()

    if self.itr_counter == 1 and self.env_sampler.policy.dynamics_model.normalization is None:
        if self.verbose:
            logger.log('Data starts first step...')
        self.env_sampler.policy.dynamics_model = pickle.loads(
            self.queue.get())
        if self.verbose:
            logger.log('Data first step done...')

    '''------------- Obtaining samples from the environment -----------'''
    if self.verbose:
        logger.log("Data is obtaining samples...")
    env_paths = self.env_sampler.obtain_samples(
        log=True,
        random=random_sinusoid[0],
        sinusoid=random_sinusoid[1],
        log_prefix='Data-EnvSampler-',
    )

    '''-------------- Processing environment samples -------------------'''
    if self.verbose:
        logger.log("Data is processing samples...")
    samples_data = self.dynamics_sample_processor.process_samples(
        env_paths,
        log=True,
        log_prefix='Data-EnvTrajs-',
    )
    self.samples_data_arr.append(samples_data)

    time_step = time.time() - time_step
    time_sleep = max(self.simulation_sleep - time_step, 0)
    time.sleep(time_sleep)

    logger.logkv('Data-TimeStep', time_step)
    logger.logkv('Data-TimeSleep', time_sleep)

def run_supervised(self, policy, teacher_dict, tag):
    paths = self.sampler.obtain_samples(
        log=False,
        advance_curriculum=False,
        policy=policy,
        teacher_dict=teacher_dict,
        max_action=False)  # TODO: consider adding a flag for max_action
    samples_data = self.sample_processor.process_samples(
        paths,
        log='all',
        log_prefix=tag,
        log_teacher=self.train_with_teacher)
    advance_curriculum, avg_success, avg_accuracy = \
        self.check_advance_curriculum_rollout(samples_data)
    logger.logkv(f"{tag}Advance", int(advance_curriculum))
    logger.logkv(f"{tag}AvgSuccess", avg_success)
    logger.logkv(f"{tag}AvgAccuracy", avg_accuracy)
    return advance_curriculum