def process_samples(self, itr, paths):
    print("process paths in npo_snn_rewards")
    # count visitations or whatever the bonus wants to do. This should not modify the paths
    for b_eval in self.bonus_evaluator:
        logger.log("fitting bonus evaluator before processing...")
        b_eval.fit_before_process_samples(paths)
        logger.log("fitted")

    # save real undiscounted rewards before changing them
    undiscounted_returns = [sum(path["rewards"]) for path in paths]
    logger.record_tabular('TrueAverageReturn', np.mean(undiscounted_returns))
    # logger.record_tabular('Episodic_reward', undiscounted_returns)
    # print("paths_len", len(paths))
    for path in paths:
        path['true_rewards'] = list(path['rewards'])

    # If using a latent regressor (and possibly adding MI to the reward):
    if isinstance(self.latent_regressor, Latent_regressor):
        with logger.prefix(' Latent_regressor '):
            self.latent_regressor.fit(paths)

            if self.reward_regressor_mi:
                for i, path in enumerate(paths):
                    path['logli_latent_regressor'] = self.latent_regressor.predict_log_likelihood(
                        [path], [path['agent_infos']['latents']])[0]  # this is for paths usually..
                    # the logli of the latent is the variable of the mutual information
                    path['rewards'] += self.reward_regressor_mi * path['logli_latent_regressor']

    # for the extra bonus
    for b, b_eval in enumerate(self.bonus_evaluator):
        for i, path in enumerate(paths):
            bonuses = b_eval.predict(path)
            # if i == 0:
            #     print("path", path['actions'])
            #     print("bonus", bonuses.shape)
            #     print("reward", path['rewards'].shape)
            path['rewards'] += self.reward_coef_bonus[b] * bonuses

    real_samples = ext.extract_dict(
        BatchSampler.process_samples(self, itr, paths),
        # I don't need to process the hallucinated samples: the R, A,.. same!
        "observations", "actions", "advantages", "env_infos", "agent_infos"
    )
    real_samples["importance_weights"] = np.ones_like(real_samples["advantages"])
    # print("real_samples", real_samples)
    return real_samples
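
# NOTE (added commentary, not original code): after process_samples, each path's
# reward stream is, conceptually,
#     rewards = true_rewards
#               + reward_regressor_mi * log-likelihood of the latent under the regressor  (if enabled)
#               + sum over b of reward_coef_bonus[b] * bonus_evaluator[b].predict(path)
# while the unmodified returns are kept in path['true_rewards'] and logged as
# 'TrueAverageReturn'. The exact bonus terms depend on the configured
# bonus_evaluator objects.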
def process_path(paths, itr, low_sampler):
    paths_low = []
    for idx, path in enumerate(paths):
        obs_shape = path['env_infos']["full_path"]["observations"].shape[2]
        # print("obs_shape", path['env_infos']["full_path"]["observations"].shape)
        act_shape = path['env_infos']["full_path"]["actions"].shape[2]
        path_low = dict(
            observations=path['env_infos']["full_path"]["observations"].reshape([-1, obs_shape]),
            actions=path['env_infos']["full_path"]["actions"].reshape([-1, act_shape]),
            rewards=path['env_infos']["full_path"]["rewards"].reshape([-1]),
        )
        # print("obs_shape", path_low["observations"].shape)
        # print("act_shape", path_low["actions"].shape)
        # print("reward_shape", path_low["rewards"].shape)

        agent_info_low = dict()
        for key in path['env_infos']["full_path"]['agent_infos']:
            new_shape = path['env_infos']["full_path"]["agent_infos"][key].shape[2]
            agent_info_low[key] = path['env_infos']["full_path"]['agent_infos'][key].reshape([-1, new_shape])
            # print(key, agent_info_low[key].shape)
        path_low["agent_infos"] = agent_info_low

        env_info_low = dict()
        for key in path['env_infos']["full_path"]['env_infos']:
            # print(key, path)
            if key == 'com':
                new_shape = path['env_infos']["full_path"]["env_infos"][key].shape[2]
                env_info_low[key] = path['env_infos']["full_path"]['env_infos'][key].reshape([-1, new_shape])
            else:
                env_info_low[key] = path['env_infos']["full_path"]['env_infos'][key].reshape([-1])
        path_low["env_infos"] = env_info_low

        paths_low.append(path_low)

    real_samples = ext.extract_dict(
        low_sampler.process_samples(itr, paths_low),
        # I don't need to process the hallucinated samples: the R, A,.. same!
        "observations", "actions", "advantages", "env_infos", "agent_infos"
    )
    real_samples["importance_weights"] = np.ones_like(real_samples["advantages"])
    return real_samples
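
# NOTE (added commentary, not original code): process_path assumes the env wrapper
# stores each high-level step's low-level sub-trajectory stacked along a middle axis,
# e.g. (illustrative shapes only, not taken from the original source):
#     full_path["observations"].shape == (n_high_steps, time_steps_agg, obs_dim)
#     full_path["actions"].shape      == (n_high_steps, time_steps_agg, act_dim)
# so reshape([-1, dim]) flattens them into a single low-level trajectory of
# n_high_steps * time_steps_agg transitions before low_sampler.process_samples
# computes advantages on it.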
def train(self):
    self.start_worker()
    self.init_opt()
    # init_opt for the low-level policy
    if self.train_low:
        self.init_opt_low()
    high_times = 0
    obs_concat = adv_concat = lat_concat = pro_concat = act_concat = np.array([])
    start_i = 0
    # for itr in range(self.current_itr, self.n_itr):
    for itr in range(start_i, self.n_itr):
        gc.collect()  # force freeing memory
        if self.transfer and itr == start_i:
            self.warm_start()
        with logger.prefix('itr #%d | ' % itr):
            if self.step_anneal:
                self.anneal_step_num(itr)  # update the step length
            paths = self.sampler.obtain_samples(itr)
            self.discount = self.discount_high  # change discount every time we train the high-level policy!
            samples_data = self.sampler.process_samples(itr, paths)
            self.log_diagnostics(paths)

            if self.train_high:  # train the high-level policy
                if self.train_high_every and self.train_high_every != 1:
                    # train_high_every is the number of iterations of low-level training per high-level update
                    if high_times < self.train_high_every:
                        if high_times == 0:
                            # initialize the concatenation variables
                            obs_concat, act_concat, adv_concat = ext.extract(
                                samples_data, "observations", "actions", "advantages")
                            pro_concat = samples_data["agent_infos"]['prob']
                            lat_concat = samples_data["agent_infos"]['latents']
                        else:
                            # below: how samples are concatenated across iterations
                            obs_tmp, act_tmp, adv_tmp = ext.extract(
                                samples_data, "observations", "actions", "advantages")
                            pro_tmp = samples_data["agent_infos"]['prob']
                            lat_tmp = samples_data["agent_infos"]['latents']
                            obs_concat = np.concatenate((obs_concat, obs_tmp), axis=0)
                            act_concat = np.concatenate((act_concat, act_tmp), axis=0)
                            adv_concat = np.concatenate((adv_concat, adv_tmp), axis=0)
                            pro_concat = np.concatenate((pro_concat, pro_tmp), axis=0)
                            lat_concat = np.concatenate((lat_concat, lat_tmp), axis=0)
                            # above: how samples are concatenated across iterations
                    if high_times == self.train_high_every:
                        high_times = 0
                        samples_data_concatenated = {
                            'observations': obs_concat,
                            'actions': act_concat,
                            'advantages': adv_concat,
                            'agent_infos': {
                                'prob': pro_concat,
                                'latents': lat_concat
                            }
                        }
                        print("training high policy")
                        self.optimize_policy(itr, samples_data_concatenated)
                    high_times += 1
                else:
                    self.optimize_policy(itr, samples_data)

            if not self.train_low:
                pass  # not training the low-level policy
            elif self.train_low_with_external:
                print("training low policy with external rewards only")
                paths_low = []
                for idx, path in enumerate(paths):
                    last_low_step_num = len(path["env_infos"]["full_path"]["rewards"][-1])
                    path_low = dict(
                        observations=np.concatenate(path['env_infos']["full_path"]["observations"]),
                        actions=np.concatenate(path['env_infos']["full_path"]["actions"]),
                        rewards=np.concatenate(path['env_infos']["full_path"]["rewards"]),
                    )
                    # WR: trim the observation
                    path_low['observations'] = path_low['observations'][:, :self.low_policy.obs_robot_dim]
                    agent_info_low = dict()
                    for key in path['env_infos']["full_path"]['agent_infos']:
                        agent_info_low[key] = np.concatenate(
                            path['env_infos']["full_path"]['agent_infos'][key])
                    path_low["agent_infos"] = agent_info_low
                    env_info_low = dict()
                    for key in path['env_infos']["full_path"]['env_infos']:
                        # print(key, path)
                        env_info_low[key] = np.concatenate(
                            path['env_infos']["full_path"]["env_infos"][key])
                    path_low["env_infos"] = env_info_low
                    paths_low.append(path_low)
                real_samples = ext.extract_dict(
                    self.low_sampler.process_samples(itr, paths_low),
                    # I don't need to process the hallucinated samples: the R, A,.. same!
                    "observations", "actions", "advantages", "env_infos", "agent_infos")
                real_samples["importance_weights"] = np.ones_like(real_samples["advantages"])
                self.optimize_policy_low(itr, real_samples)
            elif self.train_low_with_v_split:
                print("training low policy with HAAR")
                # self.discount = self.discount_low
                paths_low = []
                for idx, path in enumerate(paths):
                    last_low_step_num = len(path["env_infos"]["full_path"]["rewards"][-1])
                    V_high = self.baseline.predict(path)
                    # Here we neglect gamma in the definition of the advantage (gamma is close
                    # to 1), making the expression essentially the difference in V. Using the
                    # precise definition of A yields very similar learning curves and does not
                    # affect the outcome of the experiments.
                    diff_V = np.diff(V_high) / self.env.time_steps_agg
                    for i in range(len(diff_V)):
                        # path["env_infos"]["full_path"]["rewards"][i] \
                        #     += np.ones(len(path["env_infos"]["full_path"]["rewards"][i])) * diff_V[i]
                        path["env_infos"]["full_path"]["rewards"][i] \
                            = np.ones(len(path["env_infos"]["full_path"]["rewards"][i])) * diff_V[i]
                    path_low = dict(
                        observations=np.concatenate(path['env_infos']["full_path"]["observations"]),
                        actions=np.concatenate(path['env_infos']["full_path"]["actions"]),
                        rewards=np.concatenate(path['env_infos']["full_path"]["rewards"]),
                    )
                    # cancel the winning rewards for the low level!
                    if np.sum(path['env_infos']["full_path"]['env_infos']['inner_rew']) == 1:
                        # the episode was successful: subtract the reward for reaching the goal
                        # (the outer reward) from the last step
                        path_low['rewards'][-1] -= self.env.wrapped_env.wrapped_env.goal_rew
                    # WR: trim the observation
                    path_low['observations'] = path_low['observations'][:, :self.low_policy.obs_robot_dim]
                    agent_info_low = dict()
                    for key in path['env_infos']["full_path"]['agent_infos']:
                        agent_info_low[key] = np.concatenate(
                            path['env_infos']["full_path"]['agent_infos'][key])
                    path_low["agent_infos"] = agent_info_low
                    env_info_low = dict()
                    for key in path['env_infos']["full_path"]['env_infos']:
                        # print(key, path)
                        env_info_low[key] = np.concatenate(
                            path['env_infos']["full_path"]["env_infos"][key])
                    path_low["env_infos"] = env_info_low
                    paths_low.append(path_low)
                real_samples = ext.extract_dict(
                    self.low_sampler.process_samples(itr, paths_low),
                    # I don't need to process the hallucinated samples: the R, A,.. same!
                    "observations", "actions", "advantages", "env_infos", "agent_infos")
                real_samples["importance_weights"] = np.ones_like(real_samples["advantages"])
                self.optimize_policy_low(itr, real_samples)
            else:
                print('ERROR! Unknown training mode. See batch_polopt.py for details.')

            logger.log("saving snapshot...")
            params = self.get_itr_snapshot(itr, samples_data)
            self.current_itr = itr + 1
            params["algo"] = self
            try:
                params["time_steps_agg"] = self.env.time_steps_agg
            except AttributeError:
                # the env does not have this attribute
                pass
            if self.store_paths:
                params["paths"] = samples_data["paths"]
            logger.save_itr_params(itr, params)
            logger.log("saved")
            logger.dump_tabular(with_prefix=False)
            # to prevent memory leakage:
            # info = psutil.virtual_memory()
            # print('memory percent', info.percent)
            # if info.percent > 95:
            #     break
            if self.plot:
                self.update_plot()
                if self.pause_for_plot:
                    input("Plotting evaluation run: Press Enter to continue...")
    self.shutdown_worker()