def _log_infos(self, traj_infos=None):
    if traj_infos is None:
        traj_infos = self._traj_infos
    # Per-trajectory diagnostics (keys starting with "_" are private).
    if traj_infos:
        for k in traj_infos[0]:
            if not k.startswith("_"):
                logger.record_tabular_misc_stat(
                    k, [info[k] for info in traj_infos])
    # Optimizer diagnostics, accumulated since the last log call.
    if self._opt_infos:
        for k, v in self._opt_infos.items():
            logger.record_tabular_misc_stat(k, v)
    self._opt_infos = {k: list() for k in self._opt_infos}  # (reset)
    # Per-layer parameter norms and drift from the initial values.
    if self._layerwise_stats:
        for name, param, init_val in zip(self._param_names, self._params,
                                         self._init_params_values):
            new_val = param.get_value()
            diff = new_val - init_val
            logger.record_tabular(name + "_Norm",
                                  np.sqrt(np.sum(new_val ** 2)))
            logger.record_tabular(name + "_NormFromInit",
                                  np.sqrt(np.sum(diff ** 2)))
    # Global parameter norm and drift from initialization.
    new_param_vector = self.policy.get_param_values()
    logger.record_tabular("ParamsNorm",
                          np.sqrt(np.sum(new_param_vector ** 2)))
    params_diff = new_param_vector - self._initial_param_vector
    logger.record_tabular("NormFromInit", np.sqrt(np.sum(params_diff ** 2)))
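# Note on `record_tabular_misc_stat`, used throughout below: it is assumed to
# expand one key into summary statistics (Average/Std/Median/Min/Max), with
# `placement` controlling whether the stat name goes in front of or behind
# the key. A minimal sketch of that assumption:
import numpy as np

def record_tabular_misc_stat_sketch(key, values, placement='back'):
    prefix, suffix = ("", key) if placement == 'front' else (key, "")
    for stat_name, stat in [("Average", np.mean), ("Std", np.std),
                            ("Median", np.median), ("Min", np.min),
                            ("Max", np.max)]:
        # In the real logger this would call record_tabular(...) instead.
        print(prefix + stat_name + suffix, stat(values))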
def log_diagnostics(self, paths, log_prefix='Gather', *args, **kwargs):
    # Log anything related to the gather task, then strip the gather part of
    # the observations and forward the stripped paths to the wrapped env.
    # Note: we log the pure gather reward here!
    with logger.tabular_prefix(log_prefix + '_'):
        gather_undiscounted_returns = [
            sum(path['env_infos']['outer_rew']) for path in paths
        ]
        logger.record_tabular_misc_stat('Return',
                                        gather_undiscounted_returns,
                                        placement='front')
    stripped_paths = []
    for path in paths:
        stripped_path = {}
        for k, v in path.items():
            stripped_path[k] = v
        stripped_path['observations'] = \
            stripped_path['observations'][:, :self.wrapped_env.observation_space.flat_dim]
        # this breaks if the obs of the robot are d>1 dimensional (not a vector)
        stripped_paths.append(stripped_path)
    with logger.tabular_prefix('wrapped_'):
        if 'env_infos' in paths[0].keys() and 'inner_rew' in paths[0]['env_infos'].keys():
            wrapped_undiscounted_return = np.mean(
                [np.sum(path['env_infos']['inner_rew']) for path in paths])
            logger.record_tabular('AverageReturn',
                                  wrapped_undiscounted_return)
        self.wrapped_env.log_diagnostics(
            stripped_paths)  # see swimmer_env.py for a sketch of the maze plotting!
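# Shape sketch of the stripping above (hypothetical dimensions): if the robot
# observation has flat_dim = 13 and the gather env appends 20 sensor
# readings, each path's observations go from shape (T, 33) to (T, 13), i.e.:
#     stripped = path['observations'][:, :13]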
def log_diagnostics(self, paths, prefix=''):
    progs = [
        np.linalg.norm(path["env_infos"]["com"][-1] -
                       path["env_infos"]["com"][0]) for path in paths
    ]
    logger.record_tabular_misc_stat('Progress', progs)
    self.plot_visitations(paths, visit_prefix=prefix)
def log_diagnostics(self, paths):
    progs = [
        path["observations"][-1][-3] - path["observations"][0][-3]
        for path in paths
    ]
    logger.record_tabular_misc_stat('Progress', progs, 'front')
    # Bimodality: ratio of the largest backward to the largest forward
    # progress (whichever is smaller), in [0, 1]; 1 means the policy travels
    # equally far in both directions, 0 means it only moves one way.
    largest_positive_prog = max(0, np.max(progs))
    largest_negative_prog = min(0, np.min(progs))
    if abs(largest_negative_prog) > 1e-7 and abs(largest_positive_prog) > 1e-7:
        bimod_ratio = min(
            abs(largest_negative_prog / largest_positive_prog),
            abs(largest_positive_prog / largest_negative_prog))
    else:
        bimod_ratio = 0
    logger.record_tabular('BimodalityProgress', bimod_ratio)
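# Worked example of the bimodality ratio above, as a standalone (hypothetical)
# helper: trajectories reaching -3 backward and +5 forward give
# min(3/5, 5/3) = 0.6, while purely unidirectional progress gives 0.
import numpy as np

def bimodality_ratio(progs, eps=1e-7):
    pos = max(0, np.max(progs))
    neg = min(0, np.min(progs))
    if abs(neg) > eps and abs(pos) > eps:
        return min(abs(neg / pos), abs(pos / neg))
    return 0.0

assert np.isclose(bimodality_ratio([-3.0, 1.0, 5.0]), 0.6)  # both directions
assert bimodality_ratio([0.5, 2.0, 5.0]) == 0.0             # one direction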
def log_diagnostics(self, paths, *args, **kwargs):
    # Log anything related to the maze task, then strip the maze part of the
    # observations and forward the stripped paths to the wrapped env.
    # Note: we log the pure maze reward here!
    with logger.tabular_prefix('Maze_'):
        maze_undiscounted_returns = [
            sum(path['env_infos']['outer_rew']) for path in paths
        ]
        logger.record_tabular_misc_stat('Return',
                                        maze_undiscounted_returns,
                                        placement='front')
    stripped_paths = []
    for path in paths:
        stripped_path = {}
        for k, v in path.items():
            stripped_path[k] = v
        if len(stripped_path['observations'].shape) == 1:
            stripped_path['observations'] = np.concatenate(
                stripped_path['observations'])
        stripped_path['observations'] = \
            stripped_path['observations'][:, :self.wrapped_env.observation_space.flat_dim]
        # this breaks if the obs of the robot are d>1 dimensional (not a vector)
        stripped_paths.append(stripped_path)
    with logger.tabular_prefix('wrapped_'):
        wrapped_undiscounted_return = np.mean(
            [np.sum(path['env_infos']['inner_rew']) for path in paths])
        logger.record_tabular('SuccessRate', wrapped_undiscounted_return)
        self.wrapped_env.log_diagnostics(stripped_paths, *args, **kwargs)
def process_samples(self, itr, paths):
    baselines = []
    returns = []
    if len(paths) > 0 and "vf" in paths[0]["agent_infos"]:
        # The policy provides its own value-function predictions.
        all_path_baselines = [
            p["agent_infos"]["vf"].flatten() for p in paths
        ]
    else:
        if hasattr(self.algo.baseline, "predict_n"):
            all_path_baselines = self.algo.baseline.predict_n(paths)
        else:
            all_path_baselines = [
                self.algo.baseline.predict(path) for path in paths
            ]
    for idx, path in enumerate(paths):
        path_baselines = np.append(all_path_baselines[idx], 0)
        # GAE(lambda): discounted cumulative sum of the TD residuals.
        deltas = path["rewards"] + \
                 self.algo.discount * path_baselines[1:] - \
                 path_baselines[:-1]
        path["advantages"] = special.discount_cumsum(
            deltas, self.algo.discount * self.algo.gae_lambda)
        path["returns"] = special.discount_cumsum(path["rewards"],
                                                  self.algo.discount)
        baselines.append(path_baselines[:-1])
        returns.append(path["returns"])
    ev = special.explained_variance_1d(np.concatenate(baselines),
                                       np.concatenate(returns))

    if not self.algo.policy.recurrent:
        observations = tensor_utils.concat_tensor_list(
            [path["observations"] for path in paths])
        actions = tensor_utils.concat_tensor_list(
            [path["actions"] for path in paths])
        rewards = tensor_utils.concat_tensor_list(
            [path["rewards"] for path in paths])
        returns = tensor_utils.concat_tensor_list(
            [path["returns"] for path in paths])
        advantages = tensor_utils.concat_tensor_list(
            [path["advantages"] for path in paths])
        env_infos = tensor_utils.concat_tensor_dict_list(
            [path["env_infos"] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list(
            [path["agent_infos"] for path in paths])

        if self.algo.center_adv:
            advantages = util.center_advantages(advantages)
        if self.algo.positive_adv:
            advantages = util.shift_advantages_to_positive(advantages)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            returns=returns,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            paths=paths,
        )
    else:
        max_path_length = max([len(path["advantages"]) for path in paths])

        # make all paths the same length (pad extra advantages with 0)
        obs = [path["observations"] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        if self.algo.center_adv:
            raw_adv = np.concatenate(
                [path["advantages"] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [(path["advantages"] - adv_mean) / adv_std
                   for path in paths]
        else:
            adv = [path["advantages"] for path in paths]
        adv = np.asarray(
            [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

        actions = [path["actions"] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path["rewards"] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        returns = [path["returns"] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)

        agent_infos = [path["agent_infos"] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in agent_infos
        ])

        env_infos = [path["env_infos"] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in env_infos
        ])

        # Mask marking real (non-padded) timesteps in each padded path.
        valids = [np.ones_like(path["returns"]) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        ent = np.sum(
            self.algo.policy.distribution.entropy(agent_infos) *
            valids) / np.sum(valids)

        samples_data = dict(
            observations=obs,
            actions=actions,
            advantages=adv,
            rewards=rewards,
            returns=returns,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
        )

    logger.log("fitting baseline...")
    if hasattr(self.algo.baseline, 'fit_with_samples'):
        self.algo.baseline.fit_with_samples(paths, samples_data)
    else:
        self.algo.baseline.fit(paths)
    logger.log("fitted")

    logger.record_tabular('Iteration', itr)
    logger.record_tabular('AverageDiscountedReturn',
                          average_discounted_return)
    logger.record_tabular('ExplainedVariance', ev)
    logger.record_tabular('NumTrajs', len(paths))
    logger.record_tabular_misc_stat('TrajLen',
                                    [len(p["rewards"]) for p in paths],
                                    placement='front')
    logger.record_tabular('Entropy', ent)
    logger.record_tabular('Perplexity', np.exp(ent))
    logger.record_tabular_misc_stat('Return',
                                    undiscounted_returns,
                                    placement='front')

    return samples_data
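# For reference, `special.discount_cumsum(x, discount)` above computes
# out[t] = x[t] + discount * x[t+1] + discount**2 * x[t+2] + ...; applied to
# the TD residuals `deltas` with factor discount * gae_lambda it yields the
# GAE advantage estimator. A pure-NumPy sketch (rllab's version uses
# scipy.signal.lfilter, but the result is the same):
import numpy as np

def discount_cumsum_sketch(x, discount):
    out = np.zeros(len(x))
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        out[t] = running
    return out

# discount_cumsum_sketch([1., 1., 1.], 0.9) -> [2.71, 1.9, 1.]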
def process_samples(self, itr, paths):
    # Multi-baseline variant: each path carries n reward streams, one per
    # sub-problem (self.algo.NPOs), each with its own baseline, advantages,
    # and returns.
    baselines = []
    returns = []
    n = len(paths[0]["rewards"])
    for i in range(n):
        baselines.append([])
        returns.append([])

    if len(paths) > 0 and "vf" in paths[0]["agent_infos"]:
        all_path_baselines = [
            p["agent_infos"]["vf"].flatten() for p in paths
        ]
    else:
        if hasattr(self.algo.baseline, "predict_n"):
            raise NotImplementedError
        else:
            all_path_baselines = [[
                self.algo.NPOs[i].baseline.predict(path, idx=i)
                for i in range(n)
            ] for path in paths]

    for idx, path in enumerate(paths):
        path["advantages"] = []
        path["returns"] = []
        for i in range(n):
            path_baselines = np.append(all_path_baselines[idx][i], 0)
            deltas = path["rewards"][i] + \
                     self.algo.discount * path_baselines[1:] - \
                     path_baselines[:-1]
            path["advantages"].append(
                special.discount_cumsum(
                    deltas, self.algo.discount * self.algo.gae_lambda))
            path["returns"].append(
                special.discount_cumsum(path["rewards"][i],
                                        self.algo.discount))
            baselines[i].append(path_baselines[:-1])
            returns[i].append(path["returns"][i])
    ev = [
        special.explained_variance_1d(np.concatenate(baselines[i]),
                                      np.concatenate(returns[i]))
        for i in range(n)
    ]

    if not self.algo.policy.recurrent:
        # Regroup per-path lists of n items into n per-index lists, then
        # concatenate across paths for each index.
        tensor_concat = lambda key: [
            tensor_utils.concat_tensor_list(x)
            for x in regroup([path[key] for path in paths])
        ]
        tensor_concat_d = lambda key: [
            tensor_utils.concat_tensor_dict_list(x)
            for x in regroup([path[key] for path in paths])
        ]
        observations_n = tensor_concat("observations")
        actions_n = tensor_concat("actions")
        rewards_n = tensor_concat("rewards")
        returns_n = tensor_concat("returns")
        advantages_n = tensor_concat("advantages")
        # env_infos_n = tensor_concat_d("env_infos")
        agent_infos_n = tensor_concat_d("agent_infos")
        # TODO(cathywu) make consistent with the rest (above)?
        # env_infos = tensor_utils.concat_tensor_dict_list([path["env_infos"] for path in paths])

        if self.algo.center_adv:
            advantages_n = [
                util.center_advantages(advantages)
                for advantages in advantages_n
            ]
        if self.algo.positive_adv:
            advantages_n = [
                util.shift_advantages_to_positive(advantages)
                for advantages in advantages_n
            ]

        average_discounted_return = \
            np.mean([sum(path["returns"][i][0] for i in range(n))
                     for path in paths])
        undiscounted_returns = [
            sum(sum(path["rewards"])) for path in paths
        ]
        ent = np.mean(
            self.algo.policy.get_distribution(idx=0).entropy(
                agent_infos_n[0]))

        samples_data_n = [
            dict(
                observations=observations_n[i],
                actions=actions_n[i],
                rewards=rewards_n[i],
                returns=returns_n[i],
                advantages=advantages_n[i],
                # env_infos=env_infos,
                agent_infos=agent_infos_n[i],
                # paths=paths,
            ) for i in range(n)
        ]
    else:
        raise NotImplementedError

    logger.log("fitting baseline...")
    if hasattr(self.algo.baseline, 'fit_with_samples'):
        raise NotImplementedError
    else:
        for idx in range(len(self.algo.NPOs)):
            self.algo.NPOs[idx].baseline.fit(paths, idx=idx)
    logger.log("fitted")

    logger.record_tabular('Iteration', itr)
    logger.record_tabular('AverageDiscountedReturn',
                          average_discounted_return)
    for i in range(len(ev)):
        logger.record_tabular('ExplainedVariance-k%d' % i, ev[i])
    logger.record_tabular('NumTrajs', len(paths))
    logger.record_tabular_misc_stat('TrajLen',
                                    [len(p["rewards"][0]) for p in paths],
                                    placement='front')
    logger.record_tabular('Entropy', ent)
    logger.record_tabular('Perplexity', np.exp(ent))
    logger.record_tabular_misc_stat('Return',
                                    undiscounted_returns,
                                    placement='front')
    return samples_data_n
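# `regroup`, used in process_samples above but not defined here, is assumed
# to transpose a per-path list of n-item lists into n per-index lists (one
# per NPO/baseline). A minimal sketch of that assumption:
def regroup(per_path_lists):
    # [[a0, a1], [b0, b1]] -> [[a0, b0], [a1, b1]]
    return [list(group) for group in zip(*per_path_lists)]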