def obtain_samples(self, itr):
    cur_params = self.algo.policy.get_param_values()  # flattened policy parameter vector
    try:
        cur_low_params = self.algo.low_policy.get_param_values()
        # env_params = cur_low_params if self.algo.train_low else None
        # need to reset the low-level policy only when training it!
        paths = parallel_sampler.sample_paths(
            policy_params=cur_params,
            low_policy_params=cur_low_params,  # low-level policy params passed alongside env params
            env_params=[self.algo.env.time_steps_agg, self.algo],  # parameters needed to rebuild the env
            max_samples=self.algo.batch_size,
            max_path_length=self.algo.max_path_length,
            scope=self.algo.scope,
        )
    except AttributeError:
        paths = parallel_sampler.sample_paths(
            policy_params=cur_params,
            max_samples=self.algo.batch_size,
            max_path_length=self.algo.max_path_length,
            scope=self.algo.scope,
        )
    if self.algo.whole_paths:  # the usual branch: return whole paths
        return paths
    else:
        paths_truncated = parallel_sampler.truncate_paths(paths, self.algo.batch_size)
        return paths_truncated
def obtain_samples(self, itr, reset_args=None, return_dict=False, log_prefix=''):
    init_policy_params = cur_policy_params = self.algo.policy.get_param_values()
    if hasattr(self.algo.env, "get_param_values"):
        try:
            cur_env_params = self.algo.env.get_param_values()
        except:
            cur_env_params = None
    else:
        cur_env_params = None

    import time
    start = time.time()

    # First, a naive implementation: broadcast a single reset_arg to every env.
    if type(reset_args) != list and type(reset_args) != np.ndarray:
        reset_args = [reset_args] * self.n_envs
    if self.algo.policy.all_param_vals:
        cur_policy_params = [flatten_tensors(x.values()) for x in self.algo.policy.all_param_vals]
    else:
        cur_policy_params = [cur_policy_params] * self.n_envs

    # Assume that n_envs == number of parallel workers.
    if self.n_envs == parallel_sampler.singleton_pool.n_parallel:
        raise NotImplementedError('this implementation is buggy.')
        # 1 thread per env (unreachable until the branch above is fixed)
        paths = parallel_sampler.sample_paths(
            policy_params=cur_policy_params,
            env_params=cur_env_params,
            max_samples=self.algo.batch_size,
            max_path_length=self.algo.max_path_length,
            scope=self.algo.scope,
            reset_arg=reset_args,
            show_prog_bar=True,
            multi_task=True,
        )
    else:
        # do tasks sequentially and parallelize within rollouts per task.
        paths = {}
        for i in range(self.n_envs):
            paths[i] = parallel_sampler.sample_paths(
                policy_params=cur_policy_params[i],
                env_params=cur_env_params,
                max_samples=self.algo.batch_size / self.n_envs,
                max_path_length=self.algo.max_path_length,
                scope=self.algo.scope,
                reset_arg=reset_args[i],
                show_prog_bar=False,
            )
    total_time = time.time() - start
    logger.record_tabular(log_prefix + "TotalExecTime", total_time)

    if not return_dict:
        flatten_list = lambda l: [item for sublist in l for item in sublist]
        paths = flatten_list(paths.values())

    self.algo.policy.set_param_values(init_policy_params)

    # currently don't support not whole paths (if desired, truncate paths here)
    assert self.algo.whole_paths
    return paths
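# A minimal, self-contained illustration of the return_dict=False flattening used above
# (and in the similar per-task samplers below): the per-task dict {task_index: [paths...]}
# is collapsed into one flat list of paths. The placeholder data here is hypothetical.
def _flatten_task_paths(paths_by_task):
    """Flatten {task_index: [path, ...]} into a single list of paths."""
    return [path for task_paths in paths_by_task.values() for path in task_paths]

_demo = {0: [{"rewards": [1.0]}], 1: [{"rewards": [0.5]}, {"rewards": [2.0]}]}
assert len(_flatten_task_paths(_demo)) == 3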
def obtain_samples(self, itr):
    cur_params = self.algo.policy.get_param_values()
    paths = parallel_sampler.sample_paths(
        policy_params=cur_params,
        max_samples=self.algo.batch_size,
        max_path_length=self.algo.max_path_length,
        scope=self.algo.scope,
    )

    for path in paths:
        logli = self.algo.policy.distribution.log_likelihood(path["actions"], path["agent_infos"])
        path["log_likelihood"] = logli

    if not self.algo.all_paths:
        paths = local_truncate_paths(paths, self.algo.batch_size)

    self.env_interacts = sum([len(path["rewards"]) for path in paths])
    self.total_env_interacts += self.env_interacts
    self.mean_path_len = float(self.env_interacts) / len(paths)

    self.experience_replay.append(paths)
    self.env_interacts_memory.append(self.env_interacts)
    if len(self.experience_replay) > self.algo.batch_aggregate_n:
        self.experience_replay.pop(0)
        self.env_interacts_memory.pop(0)

    return paths
def obtain_samples(self, dyn_model=None, itr=None, policy=None, rau=None, delta=0,
                   constraint_fn=None, constraint_cost_fn=None,
                   HCMPC_Activation=False, Constrained=False):
    cur_params = self.algo.policy.get_param_values()
    paths = parallel_sampler.sample_paths(
        policy_params=cur_params,
        max_samples=self.algo.batch_size,
        dyn_model=dyn_model,
        max_path_length=self.algo.max_path_length,
        scope=self.algo.scope,
        policy=policy,
        rau=rau,
        delta=delta,
        constraint_fn=constraint_fn,
        constraint_cost_fn=constraint_cost_fn,
        HCMPC_Activation=HCMPC_Activation,
        Constrained=Constrained,
    )
    if self.algo.whole_paths:
        return paths
    else:
        paths_truncated = parallel_sampler.truncate_paths(paths, self.algo.batch_size)
        return paths_truncated
def obtain_samples(self, itr): cur_params = self.algo.policy.get_param_values() paths = parallel_sampler.sample_paths( policy_params=cur_params, max_samples=self.algo.batch_size, max_path_length=self.algo.max_path_length, scope=self.algo.scope, ) """log_likelihoods for importance sampling""" for path in paths: logli = self.algo.policy.distribution.log_likelihood(path["actions"],path["agent_infos"]) path["log_likelihood"] = logli """keep data use per iteration approximately fixed""" if not(self.algo.all_paths): paths = local_truncate_paths(paths, self.algo.batch_size) """keep track of path length""" self.env_interacts = sum([len(path["rewards"]) for path in paths]) self.total_env_interacts += self.env_interacts self.mean_path_len = float(self.env_interacts)/len(paths) """manage experience replay for old batch reuse""" self.experience_replay.append(paths) self.env_interacts_memory.append(self.env_interacts) if len(self.experience_replay) > self.algo.batch_aggregate_n: self.experience_replay.pop(0) self.env_interacts_memory.pop(0) return paths
def obtain_samples(self, itr):
    paths = parallel_sampler.sample_paths(
        policy_params=None,
        max_samples=self.algo.batch_size,
        max_path_length=self.algo.max_path_length,
        scope=self.algo.scope,
    )
    return paths
def obtain_samples(self, itr, reset_args=None, return_dict=False, log_prefix=''):
    init_policy_params_list = cur_policy_params_list = [
        policy.get_param_values() for policy in self.algo.policy_list
    ]
    if hasattr(self.algo.env, "get_param_values"):
        try:
            cur_env_params = self.algo.env.get_param_values()
        except:
            cur_env_params = None
    else:
        cur_env_params = None

    import time
    start = time.time()

    if type(reset_args) != list and type(reset_args) != np.ndarray:
        reset_args = [reset_args] * self.n_envs
    if self.algo.policy_list[0].all_param_vals is not None:
        cur_policy_params_list = [
            [flatten_tensors(x.values()) for x in policy.all_param_vals]
            for policy in self.algo.policy_list
        ]
    else:
        cur_policy_params_list = [
            [cur_policy_params] * self.n_envs
            for cur_policy_params in cur_policy_params_list
        ]

    # do tasks sequentially and parallelize within rollouts per task.
    paths = {}
    for n in range(len(self.algo.policy_list)):
        for i in range(self.n_envs):
            paths[str(n) + "_" + str(i)] = parallel_sampler.sample_paths(
                policy_params=cur_policy_params_list[n][i],
                env_params=cur_env_params,
                max_samples=self.algo.batch_size / self.n_envs,
                max_path_length=self.algo.max_path_length,
                scope=self.algo.scope,
                reset_arg=reset_args[i],
                show_prog_bar=False,
            )
    total_time = time.time() - start
    logger.record_tabular(log_prefix + "TotalExecTime", total_time)

    if not return_dict:
        flatten_list = lambda l: [item for sublist in l for item in sublist]
        paths = flatten_list(paths.values())

    for n in range(len(self.algo.policy_list)):
        self.algo.policy_list[n].set_param_values(init_policy_params_list[n])

    # currently don't support not whole paths (if desired, add code to truncate paths)
    assert self.algo.whole_paths
    return paths
def evaluate(self, steps, pool):
    logger.log("Collecting samples for evaluation")
    paths = parallel_sampler.sample_paths(
        policy_params=self.policy.get_param_values(),
        max_samples=self.eval_samples,
        max_path_length=self.max_path_length,
    )

    average_discounted_return = np.mean([
        special.discount_return(path["rewards"], self.discount) for path in paths
    ])
    returns = [sum(path["rewards"]) for path in paths]

    all_qs = np.concatenate(self.q_averages)
    all_ys = np.concatenate(self.y_averages)

    average_q_loss = np.mean(self.qf_loss_averages)
    average_policy_surr = np.mean(self.policy_surr_averages)
    average_action = np.mean(
        np.square(np.concatenate([path["actions"] for path in paths])))

    policy_reg_param_norm = np.linalg.norm(
        self.policy.get_param_values(regularizable=True))
    qfun_reg_param_norm = np.linalg.norm(
        self.qf.get_param_values(regularizable=True))

    logger.record_tabular('steps', steps)
    logger.record_tabular('AverageReturn', np.mean(returns))
    logger.record_tabular('StdReturn', np.std(returns))
    logger.record_tabular('MaxReturn', np.max(returns))
    logger.record_tabular('MinReturn', np.min(returns))
    logger.record_tabular('AverageDiscountedReturn', average_discounted_return)
    logger.record_tabular('AverageQLoss', average_q_loss)
    logger.record_tabular('AveragePolicySurr', average_policy_surr)
    logger.record_tabular('AverageQ', np.mean(all_qs))
    logger.record_tabular('AverageAbsQ', np.mean(np.abs(all_qs)))
    logger.record_tabular('AverageY', np.mean(all_ys))
    logger.record_tabular('AverageAbsY', np.mean(np.abs(all_ys)))
    logger.record_tabular('AverageAbsQYDiff', np.mean(np.abs(all_qs - all_ys)))
    logger.record_tabular('AverageAction', average_action)
    logger.record_tabular('PolicyRegParamNorm', policy_reg_param_norm)
    logger.record_tabular('QFunRegParamNorm', qfun_reg_param_norm)

    self.env.log_diagnostics(paths)
    self.policy.log_diagnostics(paths)

    self.qf_loss_averages = []
    self.policy_surr_averages = []
    self.q_averages = []
    self.y_averages = []
    self.es_path_returns = []
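# For reference, the discounted return logged above via special.discount_return is
# sum_t gamma^t * r_t. A stand-alone version of that quantity (assuming a 1-D reward
# sequence), useful for sanity-checking the logged numbers:
import numpy as np

def _discounted_return(rewards, discount):
    """Return sum_t discount**t * rewards[t]."""
    rewards = np.asarray(rewards, dtype=np.float64)
    return float(np.sum(rewards * discount ** np.arange(len(rewards))))

assert abs(_discounted_return([1.0, 1.0, 1.0], 0.5) - 1.75) < 1e-12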
def obtain_samples(self, itr):
    cur_params = self.policy.get_param_values()
    paths = parallel_sampler.sample_paths(
        policy_params=cur_params,
        max_samples=self.batch_size,
        max_path_length=self.max_path_length,
    )
    if self.whole_paths:
        return paths
    else:
        paths_truncated = parallel_sampler.truncate_paths(paths, self.batch_size)
        return paths_truncated
def obtain_samples(self, itr, env_params=None):
    all_paths = []
    for action_sequence in self.action_sequences:
        paths = parallel_sampler.sample_paths(
            policy_params=action_sequence,
            max_samples=self.n_traj,
            max_path_length=action_sequence.shape[0],
            scope=self.scope,
            count_traj=True,
            terminate_only_max_path=True,
            env_params=env_params)
        # truncate the paths if we collected more than self.n_traj
        all_paths += paths[:self.n_traj]
    return all_paths
def obtain_samples(self, itr, reset_args=None, policy_contexts=None, return_dict=False):
    init_policy_params = cur_policy_params = self.algo.policy.get_param_values()
    if hasattr(self.algo.env, "get_param_values"):
        try:
            cur_env_params = self.algo.env.get_param_values()
        except:
            cur_env_params = None
    else:
        cur_env_params = None

    import time
    start = time.time()

    if type(reset_args) != list and type(reset_args) != np.ndarray:
        reset_args = [reset_args] * self.n_envs
    if type(policy_contexts) != list and type(policy_contexts) != np.ndarray:
        policy_contexts = [policy_contexts] * self.n_envs
    cur_policy_params = [cur_policy_params] * self.n_envs

    # do tasks sequentially and parallelize within rollouts per task.
    paths = {}
    for i in range(self.n_envs):
        paths[i] = parallel_sampler.sample_paths(
            policy_params=cur_policy_params[i],
            env_params=cur_env_params,
            max_samples=self.algo.batch_size / self.n_envs,
            max_path_length=self.algo.max_path_length,
            scope=self.algo.scope,
            reset_arg=reset_args[i],
            policy_context=policy_contexts[i],
            show_prog_bar=False,
        )
    total_time = time.time() - start
    logger.record_tabular("TotalExecTime", total_time)

    if not return_dict:
        flatten_list = lambda l: [item for sublist in l for item in sublist]
        paths = flatten_list(paths.values())

    self.algo.policy.set_param_values(init_policy_params)

    # currently don't support not whole paths (if desired, add code to truncate paths)
    assert self.algo.whole_paths
    return paths
def obtain_samples(self, itr, reset_args=None, return_dict=False, log_prefix='',
                   extra_input=None, extra_input_dim=None, save_img_obs=False, preupdate=True):
    if extra_input is not None:
        assert False, "not implemented"
    if not preupdate:
        assert False, "not implemented"

    init_policy_params = cur_policy_params = self.algo.policy.get_param_values()
    if hasattr(self.algo.env, "get_param_values"):
        try:
            cur_env_params = self.algo.env.get_param_values()
        except:
            cur_env_params = None
    else:
        cur_env_params = None

    import time
    start = time.time()

    if type(reset_args) != list and type(reset_args) != np.ndarray:
        reset_args = [reset_args] * self.n_envs
    if hasattr(self.algo.policy, 'all_param_vals'):
        # TODO: RK, need to make this less hacky and still work with non-maml policies
        if self.algo.policy.all_param_vals:
            cur_policy_params = [flatten_tensors(x.values()) for x in self.algo.policy.all_param_vals]
        else:
            cur_policy_params = [cur_policy_params] * self.n_envs
    else:
        cur_policy_params = [cur_policy_params] * self.n_envs

    # do tasks sequentially and parallelize within rollouts per task.
    paths = {}
    for i in range(self.n_envs):
        paths[i] = parallel_sampler.sample_paths(
            policy_params=cur_policy_params[i],
            env_params=cur_env_params,
            max_samples=self.algo.batch_size / self.n_envs,
            max_path_length=self.algo.max_path_length,
            scope=self.algo.scope,
            reset_arg=reset_args[i],
            show_prog_bar=False,
        )
    total_time = time.time() - start
    logger.record_tabular(log_prefix + "TotalExecTime", total_time)

    if not return_dict:
        flatten_list = lambda l: [item for sublist in l for item in sublist]
        paths = flatten_list(paths.values())

    self.algo.policy.set_param_values(init_policy_params)

    # currently don't support not whole paths (if desired, add code to truncate paths)
    assert self.algo.whole_paths
    return paths
def evaluate(self, epoch, pool):
    logger.log('Collecting samples for evaluation')
    paths = parallel_sampler.sample_paths(
        policy_params=self.policy.get_param_values(),
        max_samples=self.eval_max_samples,
        max_path_length=self.eval_max_path_length,
    )

    average_discounted_return = np.mean(
        [special.discount_return(path['rewards'], self.discount) for path in paths]
    )
    returns = [sum(path['rewards']) for path in paths]

    logger.record_tabular('Epoch', epoch)
    logger.record_tabular('AverageReturn', np.mean(returns))
    logger.record_tabular('AverageDiscountedReturn', average_discounted_return)
    logger.record_tabular('StdReturn', np.std(returns))
    logger.record_tabular('MaxReturn', np.max(returns))
    logger.record_tabular('MinReturn', np.min(returns))
    if len(self.es_path_returns) > 0:
        logger.record_tabular('AverageEsReturn', np.mean(self.es_path_returns))
        logger.record_tabular('StdEsReturn', np.std(self.es_path_returns))
        logger.record_tabular('MaxEsReturn', np.max(self.es_path_returns))
        logger.record_tabular('MinEsReturn', np.min(self.es_path_returns))
        logger.record_tabular('AverageEsPathLength', np.mean(self.es_path_length))
    logger.record_tabular('AverageQLoss', np.mean(self.qf_loss_averages))

    all_qs = np.concatenate(self.qs_averages)
    all_ys = np.concatenate(self.ys_averages)
    logger.record_tabular('AverageQ', np.mean(all_qs))
    logger.record_tabular('AverageAbsQ', np.mean(np.abs(all_qs)))
    logger.record_tabular('AverageY', np.mean(all_ys))
    logger.record_tabular('AverageAbsY', np.mean(np.abs(all_ys)))

    self.qf_loss_averages = []
    self.qs_averages = []
    self.ys_averages = []
    self.es_path_length = []
    self.es_path_returns = []
def obtain_samples(self, itr):
    cur_params = self.algo.policy.get_param_values()
    paths = parallel_sampler.sample_paths(
        policy_params=cur_params,
        max_samples=self.algo.batch_size,
        include_original_frames=True,
        max_path_length=self.algo.max_path_length,
        scope=self.algo.scope,
    )
    if self.algo.whole_paths:
        return paths
    else:
        paths_truncated = parallel_sampler.truncate_paths(paths, self.algo.batch_size)
        return paths_truncated
def obtain_samples(self, itr):
    cur_params = self.algo.policy.get_param_values()
    paths = parallel_sampler.sample_paths(
        policy_params=cur_params,  # TODO - can I just pass in new parameters here? (the updated ones?)
        max_samples=self.algo.batch_size,
        max_path_length=self.algo.max_path_length,
        scope=self.algo.scope,
    )
    # TODO - does the optimizer assume that the paths came from a policy with params cur_params?
    # Or can I just pass in cur_params - alpha*grads?
    if self.algo.whole_paths:
        return paths
    else:
        paths_truncated = parallel_sampler.truncate_paths(paths, self.algo.batch_size)
        return paths_truncated
def obtain_samples(self, itr):
    cur_pro_params = self.algo.pro_policy.get_param_values()
    cur_adv_params = self.algo.adv_policy.get_param_values()
    paths = parallel_sampler.sample_paths(
        pro_policy_params=cur_pro_params,
        max_samples=self.algo.batch_size,
        max_path_length=self.algo.max_path_length,
        scope=self.algo.scope,
        adv_policy_params=cur_adv_params,
    )
    if self.algo.whole_paths:
        return paths
    else:
        paths_truncated = parallel_sampler.truncate_paths(paths, self.algo.batch_size)
        return paths_truncated
def obtain_samples(self, itr, determ=False):
    cur_policy_params = self.algo.policy.get_param_values()
    cur_env_params = self.algo.env.get_param_values()
    paths = parallel_sampler.sample_paths(
        policy_params=cur_policy_params,
        env_params=cur_env_params,
        max_samples=self.algo.batch_size,
        max_path_length=self.algo.max_path_length,
        scope=self.algo.scope,
    )
    if self.algo.whole_paths:
        return paths
    else:
        paths_truncated = parallel_sampler.truncate_paths(paths, self.algo.batch_size)
        return paths_truncated
def train(self):
    parallel_sampler.populate_task(self.env, self.policy, self.scope)
    obs = self.env.reset()
    for i in range(10):
        logger.log("Epoch %d" % i)
        for _ in range(100):
            action, _ = self.policy.get_action(obs)
            next_obs, rew, done, info = self.env.step(action)
            obs = next_obs if not done else self.env.reset()
        logger.log("Evaluating...")
        paths = parallel_sampler.sample_paths(
            policy_params=self.policy.get_param_values(),
            max_samples=20,
            max_path_length=100,
        )
    parallel_sampler.terminate_task()
def obtain_samples(self, itr):
    if config.TF_NN_SETTRACE:
        ipdb.set_trace()
    cur_policy_params = self.algo.policy.get_param_values()
    paths = parallel_sampler.sample_paths(
        policy_params=cur_policy_params,
        env_params=None,
        max_samples=self.algo.batch_size,
        max_path_length=self.algo.max_path_length,
        scope=self.algo.scope,
    )
    if self.algo.whole_paths:
        return paths
    else:
        paths_truncated = parallel_sampler.truncate_paths(paths, self.algo.batch_size)
        return paths_truncated
def obtain_samples(self, itr):
    if hasattr(self.algo.policy, 'get_param_values_with_baseline'):
        cur_params = self.algo.policy.get_param_values_with_baseline()
    else:
        cur_params = self.algo.policy.get_param_values()
    paths = parallel_sampler.sample_paths(
        policy_params=cur_params,
        max_samples=self.algo.batch_size,
        max_path_length=self.algo.max_path_length,
        scope=self.algo.scope,
    )
    if self.algo.whole_paths:
        return paths
    else:
        paths_truncated = parallel_sampler.truncate_paths(paths, self.algo.batch_size)
        return paths_truncated
def obtain_samples(self, itr, env_params=None):
    try:
        cur_params = self.policy.get_param_values()
    except AttributeError:
        cur_params = None
    paths = parallel_sampler.sample_paths(
        policy_params=cur_params,
        max_samples=self.n_traj,
        max_path_length=self.max_path_length,
        scope=self.scope,
        useImitationEnv=self.useImitationEnv,
        useImitationPolicy=self.useImitationPolicy,
        count_traj=True,
        terminate_only_max_path=self.terminate_only_max_path,
        env_params=env_params)
    # truncate the paths if we collected more than self.n_traj
    return paths[:self.n_traj]
def obtain_samples(self, itr, include_joint_coords=False):
    # TODO: include_joint_coords not supported for BatchSampler yet.
    cur_policy_params = self.algo.policy.get_param_values()
    cur_env_params = self.algo.env.get_param_values()
    paths = parallel_sampler.sample_paths(
        policy_params=cur_policy_params,
        env_params=cur_env_params,
        max_samples=self.algo.batch_size,
        max_path_length=self.algo.max_path_length,
        scope=self.algo.scope,
    )
    if self.algo.whole_paths:
        return paths
    else:
        paths_truncated = parallel_sampler.truncate_paths(paths, self.algo.batch_size)
        return paths_truncated
def obtain_samples(self, itr):
    cur_policy_params = self.algo.policy.get_param_values()
    if hasattr(self.algo.env, "get_param_values"):
        cur_env_params = self.algo.env.get_param_values()
    else:
        cur_env_params = None
    paths = parallel_sampler.sample_paths(
        policy_params=cur_policy_params,
        env_params=cur_env_params,
        max_samples=self.algo.batch_size,
        max_path_length=self.algo.max_path_length,
        scope=self.algo.scope,
    )
    if self.algo.whole_paths:
        return paths
    else:
        paths_truncated = parallel_sampler.truncate_paths(paths, self.algo.batch_size)
        return paths_truncated
def obtain_samples(self, itr):
    cur_policy_params = self.algo.policy.get_param_values()
    # if hasattr(self.algo.env, "get_param_values"):
    #     cur_env_params = self.algo.env.get_param_values()
    # else:
    #     cur_env_params = None
    paths = parallel_sampler.sample_paths(
        policy_params=cur_policy_params,
        env_params=None,
        max_samples=self.algo.batch_size,
        max_path_length=self.algo.max_path_length,
        scope=self.algo.scope,
    )
    if self.algo.whole_paths:
        return paths
    else:
        paths_truncated = parallel_sampler.truncate_paths(paths, self.algo.batch_size)
        return paths_truncated
def obtain_samples(self, itr, reset_args=None, return_dict=False, log_prefix=''):
    init_policy_params = cur_policy_params = self.algo.policy.get_param_values()
    if hasattr(self.algo.env, "get_param_values"):
        try:
            cur_env_params = self.algo.env.get_param_values()
        except:
            cur_env_params = None
    else:
        cur_env_params = None

    import time
    start = time.time()

    if type(reset_args) != list and type(reset_args) != np.ndarray:
        reset_args = [reset_args] * self.n_envs
    if self.algo.policy.all_param_vals:
        cur_policy_params = [flatten_tensors(x.values()) for x in self.algo.policy.all_param_vals]
    else:
        cur_policy_params = [cur_policy_params] * self.n_envs

    # do tasks sequentially and parallelize within rollouts per task.
    paths = {}
    for i in range(self.n_envs):
        paths[i] = parallel_sampler.sample_paths(
            policy_params=cur_policy_params[i],
            env_params=cur_env_params,
            max_samples=self.algo.batch_size / self.n_envs,
            max_path_length=self.algo.max_path_length,
            scope=self.algo.scope,
            reset_arg=reset_args[i],
            show_prog_bar=False,
        )
    total_time = time.time() - start
    logger.record_tabular(log_prefix + "TotalExecTime", total_time)

    if not return_dict:
        flatten_list = lambda l: [item for sublist in l for item in sublist]
        paths = flatten_list(paths.values())

    self.algo.policy.set_param_values(init_policy_params)

    # currently don't support not whole paths (if desired, add code to truncate paths)
    assert self.algo.whole_paths
    return paths
def obtain_samples(self, itr, target_task=None):
    cur_params = self.algo.policy.get_param_values()
    paths = parallel_sampler.sample_paths(
        policy_params=cur_params,
        max_samples=self.algo.batch_size,
        max_path_length=self.algo.max_path_length,
        scope=self.algo.scope,
        iter=itr,
        policy=self.algo.policy,
        env=self.algo.env,
        baseline=self.algo.baseline,
        target_task=target_task,
    )
    if self.algo.whole_paths:
        return paths
    else:
        paths_truncated = parallel_sampler.truncate_paths(paths, self.algo.batch_size)
        return paths_truncated
def evaluate(self, epoch, opt_info):
    logger.log('Collecting samples for evaluation')
    paths = parallel_sampler.sample_paths(
        policy_params=opt_info['target_policy'],
        max_samples=self.eval_max_samples,
        max_path_length=self.eval_max_path_length,
    )
    average_discounted_return = np.mean([
        special.discount_return(path['rewards'], self.discount) for path in paths
    ])
    returns = [sum(path['rewards']) for path in paths]
    logger.record_tabular('Epoch', epoch)
    logger.record_tabular('AverageReturn', np.mean(returns))
    logger.record_tabular('AverageDiscountedReturn', average_discounted_return)
    logger.record_tabular('StdReturn', np.std(returns))
    logger.record_tabular('MaxReturn', np.max(returns))
    logger.record_tabular('MinReturn', np.min(returns))
def obtain_samples(self, itr):
    cur_params = self.algo.policy.get_param_values()
    raw_paths = parallel_sampler.sample_paths(
        policy_params=cur_params,
        max_samples=self.algo.batch_size,
        max_path_length=self.algo.max_path_length,
        scope=self.algo.scope,
    )
    if self.period is None:
        # hippo random p
        paths = raw_paths
    else:
        # TODO: this will break for environments where the rollout terminates after the goal is reached
        paths = []
        for path in raw_paths:
            # cut each path down to the largest multiple of self.period that fits
            new_length = (len(path['rewards']) // self.period) * self.period
            for key in path.keys():
                if isinstance(path[key], dict):
                    for key2 in path[key].keys():
                        path[key][key2] = path[key][key2][:new_length]
                else:
                    path[key] = path[key][:new_length]
            if len(path['rewards']) > 0:
                paths.append(path)
    if self.algo.whole_paths:
        return paths
    else:
        paths_truncated = parallel_sampler.truncate_paths(paths, self.algo.batch_size)
        return paths_truncated
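# A quick check of the truncation rule above: each path is cut to the largest multiple of
# self.period that fits, and paths shorter than one period are dropped. Hypothetical numbers:
period = 5
for path_len, expected in [(12, 10), (5, 5), (3, 0)]:
    assert (path_len // period) * period == expected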
def obtain_samples(self, itr, reset_args=None, return_dict=False, log_prefix='',
                   extra_input=None, extra_input_dim=None, save_img_obs=False,
                   preupdate=True, numTrajs_perTask=None):
    # if not preupdate:
    #     assert False, "not implemented"
    init_policy_params = cur_policy_params = self.algo.policy.get_param_values()
    if hasattr(self.algo.env, "get_param_values"):
        try:
            cur_env_params = self.algo.env.get_param_values()
        except:
            cur_env_params = None
    else:
        cur_env_params = None

    import time
    start = time.time()

    if type(reset_args) != list and type(reset_args) != np.ndarray:
        reset_args = [reset_args] * self.n_envs
    cur_policy_params = [cur_policy_params] * self.n_envs

    # do tasks sequentially and parallelize within rollouts per task.
    paths = {}
    all_param_vals_list = self.algo.policy.all_param_vals
    if extra_input is None:
        extra_infos = None
    else:
        assert extra_input in ["onehot_exploration", 'gaussian_exploration', 'onehot_hacked']
        extra_infos = [extra_input, extra_input_dim, preupdate]

    for i in range(self.n_envs):
        if self.algo.policy.all_param_vals is None:
            policy_params = cur_policy_params[i]
        else:
            policy_params = flatten_tensors(all_param_vals_list[i].values())
        paths_i = parallel_sampler.sample_paths(
            policy_params=policy_params,
            env_params=cur_env_params,
            max_samples=self.algo.batch_size / self.n_envs,
            max_path_length=self.algo.max_path_length,
            scope=self.algo.scope,
            reset_arg=reset_args[i],
            taskIdx=i,
            show_prog_bar=False,
            extra_infos=extra_infos)
        if numTrajs_perTask is not None:
            paths[i] = paths_i[:numTrajs_perTask]
        else:
            paths[i] = paths_i

    total_time = time.time() - start
    logger.record_tabular(log_prefix + "TotalExecTime", total_time)

    if not return_dict:
        flatten_list = lambda l: [item for sublist in l for item in sublist]
        paths = flatten_list(paths.values())

    # self.algo.policy.set_param_values(init_policy_params)

    # currently don't support not whole paths (if desired, add code to truncate paths)
    assert self.algo.whole_paths
    return paths
def evaluate(self, epoch, pool): logger.log("Collecting samples for evaluation") paths = parallel_sampler.sample_paths( policy_params=self.policy.get_param_values(), max_samples=self.eval_samples, max_path_length=self.max_path_length, ) average_discounted_return = np.mean( [special.discount_return(path["rewards"], self.discount) for path in paths] ) returns = [sum(path["rewards"]) for path in paths] for path in paths: path["safety_rewards"] = self.safety_constraint.evaluate(path) * self.env.bomb_cost costs = [sum(path["safety_rewards"]) for path in paths] all_qs = np.concatenate(self.q_averages) all_qs_cost = np.concatenate(self.q_cost_averages) all_ys = np.concatenate(self.y_averages) all_zs = np.concatenate(self.z_averages) average_q_loss = np.mean(self.qf_loss_averages) average_q_cost_loss = np.mean(self.qf_cost_loss_averages) average_policy_surr = np.mean(self.policy_surr_averages) average_action = np.mean(np.square(np.concatenate( [path["actions"] for path in paths] ))) policy_reg_param_norm = np.linalg.norm( self.policy.get_param_values(regularizable=True) ) qfun_reg_param_norm = np.linalg.norm( self.qf.get_param_values(regularizable=True) ) logger.record_tabular('Epoch', epoch) logger.record_tabular('AverageReturn', np.mean(returns)) logger.record_tabular('StdReturn', np.std(returns)) logger.record_tabular('MaxReturn', np.max(returns)) logger.record_tabular('MinReturn', np.min(returns)) if len(self.es_path_returns) > 0: logger.record_tabular('AverageEsReturn', np.mean(self.es_path_returns)) logger.record_tabular('StdEsReturn', np.std(self.es_path_returns)) logger.record_tabular('MaxEsReturn', np.max(self.es_path_returns)) logger.record_tabular('MinEsReturn', np.min(self.es_path_returns)) logger.record_tabular('AverageDiscountedReturn', average_discounted_return) logger.record_tabular('AverageQLoss', average_q_loss) logger.record_tabular('AveragePolicySurr', average_policy_surr) logger.record_tabular('EstimatedQ', np.mean(all_qs)) #logger.record_tabular('AverageAbsQ', np.mean(np.abs(all_qs))) #logger.record_tabular('AverageY', np.mean(all_ys)) #logger.record_tabular('AverageAbsY', np.mean(np.abs(all_ys))) #logger.record_tabular('AverageAbsQYDiff', # np.mean(np.abs(all_qs - all_ys))) #logger.record_tabular('AverageAction', average_action) #logger.record_tabular('PolicyRegParamNorm', # policy_reg_param_norm) #logger.record_tabular('QFunRegParamNorm', # qfun_reg_param_norm) logger.record_tabular('EstimatedQcost', np.mean(all_qs_cost)) #logger.record_tabular('AverageZ', np.mean(all_zs)) logger.record_tabular('AverageQcostLoss', average_q_cost_loss) logger.record_tabular('AverageCosts', np.mean(costs)) logger.record_tabular('DualVariable', self.dual_var) logger.record_tabular('AvgDual', np.mean(self.dual_history[::200])) self.env.log_diagnostics(paths) self.policy.log_diagnostics(paths) print(self.dual_history[::200]) f = open("/home/qingkai/ddpg_performance.csv", 'a') writer = csv.writer(f, delimiter=',') writer.writerow((epoch, np.mean(returns), np.mean(costs), self.dual_var, np.mean(all_qs), np.mean(all_qs_cost), self.avg_dual)) f.close() self.qf_loss_averages = [] self.qf_cost_loss_averages = [] self.policy_surr_averages = [] self.q_averages = [] self.q_cost_averages = [] self.y_averages = [] self.z_averages = [] self.es_path_returns = []
def evaluate(self, epoch, pool): logger.log("Collecting samples for evaluation") paths = parallel_sampler.sample_paths( policy_params=self.exec_policy.get_param_values(), max_samples=self.eval_samples, max_path_length=self.max_path_length, ) average_discounted_return = np.mean( [special.discount_return(path["rewards"], self.discount) for path in paths] ) returns = [sum(path["rewards"]) for path in paths] average_action = np.mean(np.square(np.concatenate( [path["actions"] for path in paths] ))) qfun_reg_param_norm = np.linalg.norm( self.qf.get_param_values(regularizable=True) ) logger.record_tabular('Epoch', epoch) logger.record_tabular('Iteration', epoch) logger.record_tabular('AverageReturn', np.mean(returns)) logger.record_tabular('StdReturn', np.std(returns)) logger.record_tabular('MaxReturn', np.max(returns)) logger.record_tabular('MinReturn', np.min(returns)) if len(self.es_path_returns) > 0: logger.record_tabular('AverageEsReturn', np.mean(self.es_path_returns)) logger.record_tabular('StdEsReturn', np.std(self.es_path_returns)) logger.record_tabular('MaxEsReturn', np.max(self.es_path_returns)) logger.record_tabular('MinEsReturn', np.min(self.es_path_returns)) logger.record_tabular('AverageDiscountedReturn', average_discounted_return) logger.record_tabular('AverageAction', average_action) logger.record_tabular('QFunRegParamNorm', qfun_reg_param_norm) self.env.log_diagnostics(paths) self.log_critic_training() self.es_path_returns = [] if not self.qf_dqn: average_policy_surr = np.mean(self.policy_surr_averages) policy_reg_param_norm = np.linalg.norm( self.policy.get_param_values(regularizable=True) ) logger.record_tabular('AveragePolicySurr', average_policy_surr) logger.record_tabular('PolicyRegParamNorm', policy_reg_param_norm) self.policy.log_diagnostics(paths) self.policy_surr_averages = []
def evaluate(self, epoch, pool): logger.log("Collecting samples for evaluation") paths = parallel_sampler.sample_paths( policy_params=self.policy.get_param_values(), max_samples=self.eval_samples, max_path_length=self.max_path_length, ) average_discounted_return = np.mean( [special.discount_return(path["rewards"], self.discount) for path in paths] ) returns = [sum(path["rewards"]) for path in paths] all_qs = np.concatenate(self.q_averages) all_ys = np.concatenate(self.y_averages) average_q_loss = np.mean(self.qf_loss_averages) average_policy_surr = np.mean(self.policy_surr_averages) average_action = np.mean(np.square(np.concatenate( [path["actions"] for path in paths] ))) policy_reg_param_norm = np.linalg.norm( self.policy.get_param_values(regularizable=True) ) qfun_reg_param_norm = np.linalg.norm( self.qf.get_param_values(regularizable=True) ) logger.record_tabular('Epoch', epoch) logger.record_tabular('AverageReturn', np.mean(returns)) logger.record_tabular('StdReturn', np.std(returns)) logger.record_tabular('MaxReturn', np.max(returns)) logger.record_tabular('MinReturn', np.min(returns)) if len(self.es_path_returns) > 0: logger.record_tabular('AverageEsReturn', np.mean(self.es_path_returns)) logger.record_tabular('StdEsReturn', np.std(self.es_path_returns)) logger.record_tabular('MaxEsReturn', np.max(self.es_path_returns)) logger.record_tabular('MinEsReturn', np.min(self.es_path_returns)) logger.record_tabular('AverageDiscountedReturn', average_discounted_return) logger.record_tabular('AverageQLoss', average_q_loss) logger.record_tabular('AveragePolicySurr', average_policy_surr) logger.record_tabular('AverageQ', np.mean(all_qs)) logger.record_tabular('AverageAbsQ', np.mean(np.abs(all_qs))) logger.record_tabular('AverageY', np.mean(all_ys)) logger.record_tabular('AverageAbsY', np.mean(np.abs(all_ys))) logger.record_tabular('AverageAbsQYDiff', np.mean(np.abs(all_qs - all_ys))) logger.record_tabular('AverageAction', average_action) logger.record_tabular('PolicyRegParamNorm', policy_reg_param_norm) logger.record_tabular('QFunRegParamNorm', qfun_reg_param_norm) self.env.log_diagnostics(paths) self.policy.log_diagnostics(paths) self.qf_loss_averages = [] self.policy_surr_averages = [] self.q_averages = [] self.y_averages = [] self.es_path_returns = []