# Assumed rllab-style module context for the samplers below; the imports were
# not shown in the original fragment. Several module-level names referenced
# here (BASELINE_TRAINING_ITRS, eval_success_left, eval_success_right,
# variable_discount_cumsum, regroup) are also defined elsewhere in the repo.
import itertools
import pdb

import numpy as np

import hgail.misc.utils
from rllab.algos import util
from rllab.misc import special
from rllab.misc import tensor_utils
import rllab.misc.logger as logger


def process_samples(self, itr, paths, prefix='', log=True, fast_process=False,
                    testitr=False, metalearn_baseline=False, comet_logger=None):
    baselines = []
    returns = []
    if testitr:
        metalearn_baseline = False
    train_baseline = (itr in BASELINE_TRAINING_ITRS)
    if not fast_process:
        for idx, path in enumerate(paths):
            path["returns"] = special.discount_cumsum(path["rewards"],
                                                      self.algo.discount)
    if not fast_process and not metalearn_baseline:
        if log:
            logger.log("fitting baseline...")
        if hasattr(self.algo.baseline, 'fit_with_samples'):
            # TODO: doesn't seem like this is ever used. Note that samples_data
            # is not defined yet at this point, so this branch would raise a
            # NameError if it were ever taken.
            self.algo.baseline.fit_with_samples(paths, samples_data)
        else:
            self.algo.baseline.fit(paths, log=log)
        if log:
            logger.log("fitted")
        if 'switch_to_init_dist' in dir(self.algo.baseline):
            self.algo.baseline.switch_to_init_dist()
        if train_baseline:
            self.algo.baseline.fit_train_baseline(paths)
        if hasattr(self.algo.baseline, "predict_n"):
            all_path_baselines = self.algo.baseline.predict_n(paths)
        else:
            all_path_baselines = [
                self.algo.baseline.predict(path) for path in paths
            ]

    for idx, path in enumerate(paths):
        if not fast_process and not metalearn_baseline:
            path_baselines = np.append(all_path_baselines[idx], 0)
            deltas = path["rewards"] + \
                self.algo.discount * path_baselines[1:] - \
                path_baselines[:-1]
            path["advantages"] = special.discount_cumsum(
                deltas, self.algo.discount * self.algo.gae_lambda)
            baselines.append(path_baselines[:-1])
        if not fast_process:
            returns.append(path["returns"])
        if "expert_actions" not in path.keys():
            if "expert_actions" in path["env_infos"].keys():
                path["expert_actions"] = path["env_infos"]["expert_actions"]
            else:
                # assert False, "you shouldn't need expert_actions"
                path["expert_actions"] = np.array(
                    [[None] * len(path['actions'][0])] * len(path['actions']))

    if not fast_process and not metalearn_baseline:  # TODO: we want the ev eventually
        ev = special.explained_variance_1d(np.concatenate(baselines),
                                           np.concatenate(returns))
        # NOTE: this L2 distance assumes all paths have equal length.
        l2 = np.linalg.norm(np.array(baselines) - np.array(returns))

    if not self.algo.policy.recurrent:
        observations = tensor_utils.concat_tensor_list(
            [path["observations"] for path in paths])
        actions = tensor_utils.concat_tensor_list(
            [path["actions"] for path in paths])
        if not fast_process:
            rewards = tensor_utils.concat_tensor_list(
                [path["rewards"] for path in paths])
            returns = tensor_utils.concat_tensor_list(
                [path["returns"] for path in paths])
        if "env_infos" in paths[0].keys():
            env_infos = tensor_utils.concat_tensor_dict_list(
                [path["env_infos"] for path in paths])
        if not fast_process and not metalearn_baseline:
            advantages = tensor_utils.concat_tensor_list(
                [path["advantages"] for path in paths])
        expert_actions = tensor_utils.concat_tensor_list(
            [path["expert_actions"] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list(
            [path["agent_infos"] for path in paths])

        if not fast_process and not metalearn_baseline:
            if self.algo.center_adv:
                advantages = util.center_advantages(advantages)
            if self.algo.positive_adv:
                advantages = util.shift_advantages_to_positive(advantages)
            if "meta_predict" in dir(self.algo.baseline):
                advantages = advantages + self.algo.baseline.meta_predict(
                    observations)
                print("debug, metalearned baseline constant is",
                      self.algo.baseline.meta_predict(observations)[0:2], "...",
                      self.algo.baseline.meta_predict(observations)[-3:-1])

        # average_discounted_return = \
        #     np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [
            sum(path.get("rewards", [0])) for path in paths
        ]
        # ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

        if fast_process:
            samples_data = dict(
                observations=observations,
                actions=actions,
                agent_infos=agent_infos,
                paths=paths,
                expert_actions=expert_actions,
            )
        elif metalearn_baseline:
            samples_data = dict(
                observations=observations,
                actions=actions,
                rewards=rewards,
                returns=returns,
                agent_infos=agent_infos,
                paths=paths,
                expert_actions=expert_actions,
            )
            if 'agent_infos_orig' in paths[0].keys():
                agent_infos_orig = tensor_utils.concat_tensor_dict_list(
                    [path["agent_infos_orig"] for path in paths])
                samples_data["agent_infos_orig"] = agent_infos_orig
        else:
            samples_data = dict(
                observations=observations,
                actions=actions,
                rewards=rewards,
                returns=returns,
                advantages=advantages,
                env_infos=env_infos,
                agent_infos=agent_infos,
                paths=paths,
                expert_actions=expert_actions,
            )
            if 'agent_infos_orig' in paths[0].keys():
                agent_infos_orig = tensor_utils.concat_tensor_dict_list(
                    [path["agent_infos_orig"] for path in paths])
                samples_data["agent_infos_orig"] = agent_infos_orig
    else:
        max_path_length = max([len(path["advantages"]) for path in paths])

        # make all paths the same length (pad extra advantages with 0)
        obs = [path["observations"] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        if self.algo.center_adv:
            raw_adv = np.concatenate([path["advantages"] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [(path["advantages"] - adv_mean) / adv_std for path in paths]
        else:
            adv = [path["advantages"] for path in paths]
        adv = np.asarray(
            [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

        actions = [path["actions"] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path["rewards"] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        returns = [path["returns"] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)

        agent_infos = [path["agent_infos"] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in agent_infos
        ])

        env_infos = [path["env_infos"] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in env_infos
        ])

        valids = [np.ones_like(path["returns"]) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [
            sum(path.get("rewards", [0])) for path in paths
        ]
        # ent = np.sum(self.algo.policy.distribution.entropy(agent_infos) *
        #              valids) / np.sum(valids)

        samples_data = dict(
            observations=obs,
            actions=actions,
            advantages=adv,
            rewards=rewards,
            returns=returns,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
        )

    if log and comet_logger:
        comet_logger.log_metric('StdReturn', np.std(undiscounted_returns))
        comet_logger.log_metric('MaxReturn', np.max(undiscounted_returns))
        comet_logger.log_metric('MinReturn', np.min(undiscounted_returns))
        comet_logger.log_metric('AverageReturn', np.mean(undiscounted_returns))

    if log:
        # logger.record_tabular('Iteration', itr)
        # logger.record_tabular('AverageDiscountedReturn',
        #                       average_discounted_return)
        logger.record_tabular(prefix + 'AverageReturn',
                              np.mean(undiscounted_returns))
        if testitr and prefix == "1":
            # TODO: make this functional for more than 1 iteration
            self.memory["AverageReturnLastTest"] = np.mean(undiscounted_returns)
            self.memory["AverageReturnBestTest"] = max(
                self.memory["AverageReturnLastTest"],
                self.memory["AverageReturnBestTest"])
            if self.memory["AverageReturnBestTest"] == 0.0:
                self.memory["AverageReturnBestTest"] = \
                    self.memory["AverageReturnLastTest"]
        if not fast_process and not metalearn_baseline:
            logger.record_tabular(prefix + 'ExplainedVariance', ev)
            logger.record_tabular(prefix + 'BaselinePredLoss', l2)
            if comet_logger:
                comet_logger.log_metric('ExplainedVariance', ev)
                comet_logger.log_metric('BaselinePredLoss', l2)
        logger.record_tabular(prefix + 'NumTrajs', len(paths))
        # logger.record_tabular(prefix + 'Entropy', ent)
        # logger.record_tabular(prefix + 'Perplexity', np.exp(ent))
        logger.record_tabular(prefix + 'StdReturn',
                              np.std(undiscounted_returns))
        logger.record_tabular(prefix + 'MaxReturn',
                              np.max(undiscounted_returns))
        logger.record_tabular(prefix + 'MinReturn',
                              np.min(undiscounted_returns))
        if "env_infos" in paths[0].keys() \
                and "success_left" in paths[0]["env_infos"].keys():
            logger.record_tabular(prefix + 'success_left',
                                  eval_success_left(paths))
            logger.record_tabular(prefix + 'success_right',
                                  eval_success_right(paths))
            if comet_logger:
                comet_logger.log_metric('success_left',
                                        eval_success_left(paths))
                comet_logger.log_metric('success_right',
                                        eval_success_right(paths))
        # else:
        #     logger.record_tabular(prefix + 'success_left', -1.0)
        #     logger.record_tabular(prefix + 'success_right', -1.0)
    # if metalearn_baseline:
    #     if hasattr(self.algo.baseline, "revert"):
    #         self.algo.baseline.revert()
    return samples_data
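
# --- Illustrative sketch (not part of the original file) ---------------------
# All of the samplers in this file compute advantages the same way: one-step
# TD residuals ("deltas") against a baseline, followed by a reverse-time
# discounted cumulative sum, i.e. GAE. A minimal standalone version, with
# discount_cumsum written out the way rllab's special.discount_cumsum
# implements it; the _demo_* names are hypothetical.

import scipy.signal


def _demo_discount_cumsum(x, discount):
    # y[t] = x[t] + discount * y[t + 1], computed as a linear filter over the
    # time-reversed sequence.
    return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1],
                                axis=0)[::-1]


def _demo_gae_advantages(rewards, baselines, discount, gae_lambda):
    # Append 0 as the terminal value, mirroring
    # `np.append(all_path_baselines[idx], 0)` above.
    b = np.append(baselines, 0)
    deltas = rewards + discount * b[1:] - b[:-1]
    return _demo_discount_cumsum(deltas, discount * gae_lambda)


# Example with hypothetical numbers:
#   _demo_gae_advantages(np.array([1.0, 0.0, 2.0]),
#                        np.array([0.5, 0.4, 0.9]),
#                        discount=0.99, gae_lambda=0.97)
# -----------------------------------------------------------------------------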
def process_samples(self, itr, paths):
    if self.algo.ma_mode == 'centralized':
        return super().process_samples(itr, paths)
    elif self.algo.ma_mode == 'decentralized':
        return super().process_samples(
            itr, list(itertools.chain.from_iterable(paths)))
    elif self.algo.ma_mode == 'concurrent':
        processed_samples = []
        for ps, policy, baseline in zip(paths, self.algo.policies,
                                        self.algo.baselines):
            baselines = []
            returns = []
            if hasattr(baseline, "predict_n"):
                all_path_baselines = baseline.predict_n(ps)
            else:
                all_path_baselines = [baseline.predict(path) for path in ps]

            for idx, path in enumerate(ps):
                path_baselines = np.append(all_path_baselines[idx], 0)
                deltas = path["rewards"] + \
                    self.algo.discount * path_baselines[1:] - \
                    path_baselines[:-1]
                path["advantages"] = special.discount_cumsum(
                    deltas, self.algo.discount * self.algo.gae_lambda)
                path["returns"] = special.discount_cumsum(path["rewards"],
                                                          self.algo.discount)
                baselines.append(path_baselines[:-1])
                returns.append(path["returns"])

            ev = special.explained_variance_1d(np.concatenate(baselines),
                                               np.concatenate(returns))

            if not policy.recurrent:
                observations = tensor_utils.concat_tensor_list(
                    [path["observations"] for path in ps])
                actions = tensor_utils.concat_tensor_list(
                    [path["actions"] for path in ps])
                rewards = tensor_utils.concat_tensor_list(
                    [path["rewards"] for path in ps])
                returns = tensor_utils.concat_tensor_list(
                    [path["returns"] for path in ps])
                advantages = tensor_utils.concat_tensor_list(
                    [path["advantages"] for path in ps])
                env_infos = tensor_utils.concat_tensor_dict_list(
                    [path["env_infos"] for path in ps])
                agent_infos = tensor_utils.concat_tensor_dict_list(
                    [path["agent_infos"] for path in ps])

                if self.algo.center_adv:
                    advantages = util.center_advantages(advantages)
                if self.algo.positive_adv:
                    advantages = util.shift_advantages_to_positive(advantages)

                average_discounted_return = \
                    np.mean([path["returns"][0] for path in ps])
                undiscounted_returns = [sum(path["rewards"]) for path in ps]
                ent = np.mean(policy.distribution.entropy(agent_infos))

                samples_data = dict(
                    observations=observations,
                    actions=actions,
                    rewards=rewards,
                    returns=returns,
                    advantages=advantages,
                    env_infos=env_infos,
                    agent_infos=agent_infos,
                    ps=ps,
                )
            else:
                max_path_length = max(
                    [len(path["advantages"]) for path in ps])

                # make all ps the same length (pad extra advantages with 0)
                obs = [path["observations"] for path in ps]
                obs = tensor_utils.pad_tensor_n(obs, max_path_length)

                if self.algo.center_adv:
                    raw_adv = np.concatenate(
                        [path["advantages"] for path in ps])
                    adv_mean = np.mean(raw_adv)
                    adv_std = np.std(raw_adv) + 1e-8
                    adv = [(path["advantages"] - adv_mean) / adv_std
                           for path in ps]
                else:
                    adv = [path["advantages"] for path in ps]
                adv = np.asarray([
                    tensor_utils.pad_tensor(a, max_path_length) for a in adv
                ])

                actions = [path["actions"] for path in ps]
                actions = tensor_utils.pad_tensor_n(actions, max_path_length)

                rewards = [path["rewards"] for path in ps]
                rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

                returns = [path["returns"] for path in ps]
                returns = tensor_utils.pad_tensor_n(returns, max_path_length)

                agent_infos = [path["agent_infos"] for path in ps]
                agent_infos = tensor_utils.stack_tensor_dict_list([
                    tensor_utils.pad_tensor_dict(p, max_path_length)
                    for p in agent_infos
                ])

                env_infos = [path["env_infos"] for path in ps]
                env_infos = tensor_utils.stack_tensor_dict_list([
                    tensor_utils.pad_tensor_dict(p, max_path_length)
                    for p in env_infos
                ])

                valids = [np.ones_like(path["returns"]) for path in ps]
                valids = tensor_utils.pad_tensor_n(valids, max_path_length)

                average_discounted_return = \
                    np.mean([path["returns"][0] for path in ps])
                undiscounted_returns = [sum(path["rewards"]) for path in ps]
                ent = np.sum(policy.distribution.entropy(agent_infos) *
                             valids) / np.sum(valids)

                samples_data = dict(
                    observations=obs,
                    actions=actions,
                    advantages=adv,
                    rewards=rewards,
                    returns=returns,
                    valids=valids,
                    agent_infos=agent_infos,
                    env_infos=env_infos,
                    ps=ps,
                )

            logger.log("fitting baseline...")
            if hasattr(baseline, 'fit_with_samples'):
                baseline.fit_with_samples(ps, samples_data)
            else:
                baseline.fit(ps)
            logger.log("fitted")

            logger.record_tabular('Iteration', itr)
            logger.record_tabular('AverageDiscountedReturn',
                                  average_discounted_return)
            logger.record_tabular('AverageReturn',
                                  np.mean(undiscounted_returns))
            logger.record_tabular('ExplainedVariance', ev)
            logger.record_tabular('NumTrajs', len(ps))
            logger.record_tabular('Entropy', ent)
            logger.record_tabular('Perplexity', np.exp(ent))
            logger.record_tabular('StdReturn', np.std(undiscounted_returns))
            logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
            logger.record_tabular('MinReturn', np.min(undiscounted_returns))
            processed_samples.append(samples_data)
        return processed_samples
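
# --- Illustrative sketch (not part of the original file) ---------------------
# What 'decentralized' mode does above: `paths` arrives as one list of
# trajectories per agent, and itertools.chain.from_iterable pools them into a
# single flat batch before the standard single-agent processing runs.

def _demo_decentralized_flatten():
    per_agent_paths = [[{"id": "a0_t0"}, {"id": "a0_t1"}],
                       [{"id": "a1_t0"}]]
    flat = list(itertools.chain.from_iterable(per_agent_paths))
    assert [p["id"] for p in flat] == ["a0_t0", "a0_t1", "a1_t0"]
# -----------------------------------------------------------------------------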
def process_samples(self, itr, paths):
    baselines = []
    returns = []
    for path in paths:
        path_baselines = np.append(self.algo.baseline.predict(path), 0)
        deltas = path["rewards"] + \
            self.algo.discount * path_baselines[1:] - \
            path_baselines[:-1]
        path["advantages"] = special.discount_cumsum(
            deltas, self.algo.discount * self.algo.gae_lambda)
        path["returns"] = special.discount_cumsum(path["rewards"],
                                                  self.algo.discount)
        baselines.append(path_baselines[:-1])
        returns.append(path["returns"])

    if not self.algo.policy.recurrent:
        observations = tensor_utils.concat_tensor_list(
            [path["observations"] for path in paths])
        actions = tensor_utils.concat_tensor_list(
            [path["actions"] for path in paths])
        rewards = tensor_utils.concat_tensor_list(
            [path["rewards"] for path in paths])
        advantages = tensor_utils.concat_tensor_list(
            [path["advantages"] for path in paths])
        env_infos = tensor_utils.concat_tensor_dict_list(
            [path["env_infos"] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list(
            [path["agent_infos"] for path in paths])

        if self.algo.center_adv:
            advantages = util.center_advantages(advantages)
        if self.algo.positive_adv:
            advantages = util.shift_advantages_to_positive(advantages)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))
        ev = special.explained_variance_1d(
            np.concatenate(baselines),
            np.concatenate(returns)
        )

        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            paths=paths,
        )
    else:
        max_path_length = max([len(path["advantages"]) for path in paths])

        # make all paths the same length (pad extra advantages with 0)
        obs = [path["observations"] for path in paths]
        obs = np.array(
            [tensor_utils.pad_tensor(ob, max_path_length) for ob in obs])

        if self.algo.center_adv:
            raw_adv = np.concatenate([path["advantages"] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [(path["advantages"] - adv_mean) / adv_std for path in paths]
        else:
            adv = [path["advantages"] for path in paths]
        adv = np.array(
            [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

        actions = [path["actions"] for path in paths]
        actions = np.array(
            [tensor_utils.pad_tensor(a, max_path_length) for a in actions])

        rewards = [path["rewards"] for path in paths]
        rewards = np.array(
            [tensor_utils.pad_tensor(r, max_path_length) for r in rewards])

        agent_infos = [path["agent_infos"] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list(
            [tensor_utils.pad_tensor_dict(p, max_path_length)
             for p in agent_infos]
        )

        env_infos = [path["env_infos"] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list(
            [tensor_utils.pad_tensor_dict(p, max_path_length)
             for p in env_infos]
        )

        valids = [np.ones_like(path["returns"]) for path in paths]
        valids = np.array(
            [tensor_utils.pad_tensor(v, max_path_length) for v in valids])

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        ent = np.sum(self.algo.policy.distribution.entropy(agent_infos) *
                     valids) / np.sum(valids)
        ev = special.explained_variance_1d(
            np.concatenate(baselines),
            np.concatenate(returns)
        )

        samples_data = dict(
            observations=obs,
            actions=actions,
            advantages=adv,
            rewards=rewards,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
        )

    logger.log("fitting baseline...")
    self.algo.baseline.fit(paths)
    logger.log("fitted")

    logger.record_tabular('Iteration', itr)
    logger.record_tabular('AverageDiscountedReturn',
                          average_discounted_return)
    logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
    logger.record_tabular('ExplainedVariance', ev)
    logger.record_tabular('NumTrajs', len(paths))
    logger.record_tabular('Entropy', ent)
    logger.record_tabular('Perplexity', np.exp(ent))
    logger.record_tabular('StdReturn', np.std(undiscounted_returns))
    logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
    logger.record_tabular('MinReturn', np.min(undiscounted_returns))

    return samples_data
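
# --- Illustrative sketch (not part of the original file) ---------------------
# Semantics of the ExplainedVariance metric logged above: 1 means the baseline
# predicts the empirical returns perfectly, 0 means it is no better than a
# constant. A sketch consistent with rllab's special.explained_variance_1d,
# including its guard for near-constant targets; _demo_* is a hypothetical
# name.

def _demo_explained_variance_1d(ypred, y):
    assert y.ndim == 1 and ypred.ndim == 1
    vary = np.var(y)
    if np.isclose(vary, 0):
        return 0 if np.var(ypred) > 0 else 1
    return 1 - np.var(y - ypred) / (vary + 1e-8)


# _demo_explained_variance_1d(np.array([1.1, 1.9, 3.2]),
#                             np.array([1.0, 2.0, 3.0]))  # close to 1
# -----------------------------------------------------------------------------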
def process_samples(self, itr, paths):
    if itr > 0:
        surprise = []
        for i in range(len(paths)):
            surprise.append(paths[i]['surprise'])
        surprise_flat = np.hstack(surprise)
        logger.record_tabular('Surprise_Mean', np.mean(surprise_flat))
        logger.record_tabular('Surprise_Std', np.std(surprise_flat))
        logger.record_tabular('Surprise_Min', np.min(surprise_flat))
        logger.record_tabular('Surprise_Max', np.max(surprise_flat))
        for i in range(len(paths)):
            paths[i]['rewards'] = paths[i]['rewards'] + self.eta * surprise[i]
    else:
        logger.record_tabular('Surprise_Mean', 0.)
        logger.record_tabular('Surprise_Std', 0.)
        logger.record_tabular('Surprise_Min', 0.)
        logger.record_tabular('Surprise_Max', 0.)

    baselines = []
    returns = []
    for path in paths:
        path_baselines = np.append(self.baseline.predict(path), 0)
        deltas = path["rewards"] + \
            self.discount * path_baselines[1:] - \
            path_baselines[:-1]
        path["advantages"] = special.discount_cumsum(
            deltas, self.discount * self.gae_lambda)
        # returns are computed from the extrinsic rewards only, without the
        # intrinsic surprise bonus
        path["returns"] = special.discount_cumsum(path["rewards_extrinsic"],
                                                  self.discount)
        baselines.append(path_baselines[:-1])
        returns.append(path["returns"])

    observations = tensor_utils.concat_tensor_list(
        [path["observations"] for path in paths])
    actions = tensor_utils.concat_tensor_list(
        [path["actions"] for path in paths])
    rewards = tensor_utils.concat_tensor_list(
        [path["rewards"] for path in paths])
    advantages = tensor_utils.concat_tensor_list(
        [path["advantages"] for path in paths])
    env_infos = tensor_utils.concat_tensor_dict_list(
        [path["env_infos"] for path in paths])
    agent_infos = tensor_utils.concat_tensor_dict_list(
        [path["agent_infos"] for path in paths])

    if self.center_adv:
        advantages = util.center_advantages(advantages)
    if self.positive_adv:
        advantages = util.shift_advantages_to_positive(advantages)

    average_discounted_return = \
        np.mean([path["returns"][0] for path in paths])
    undiscounted_returns = [
        sum(path["rewards_extrinsic"]) for path in paths
    ]
    ent = np.mean(self.policy.distribution.entropy(agent_infos))
    ev = special.explained_variance_1d(np.concatenate(baselines),
                                       np.concatenate(returns))

    samples_data = dict(
        observations=observations,
        actions=actions,
        rewards=rewards,
        advantages=advantages,
        env_infos=env_infos,
        agent_infos=agent_infos,
        paths=paths,
    )

    logger.log("fitting baseline...")
    self.baseline.fit(paths)
    logger.log("fitted")

    logger.record_tabular('Iteration', itr)
    logger.record_tabular('AverageDiscountedReturn',
                          average_discounted_return)
    logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
    logger.record_tabular('ExplainedVariance', ev)
    logger.record_tabular('NumTrajs', len(paths))
    logger.record_tabular('Entropy', ent)
    logger.record_tabular('Perplexity', np.exp(ent))
    logger.record_tabular('StdReturn', np.std(undiscounted_returns))
    logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
    logger.record_tabular('MinReturn', np.min(undiscounted_returns))

    return samples_data
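
# --- Illustrative sketch (not part of the original file) ---------------------
# The surprise-driven variant above keeps two reward streams per path: shaped
# "rewards" (extrinsic + eta * surprise) drive the advantages, while
# "rewards_extrinsic" drives the returns and all logged statistics. The values
# below are hypothetical.

def _demo_surprise_shaping(eta=0.1):
    path = {"rewards_extrinsic": np.array([1.0, 0.0, 0.5]),
            "surprise": np.array([0.2, 0.8, 0.1])}
    path["rewards"] = path["rewards_extrinsic"] + eta * path["surprise"]
    return path
# -----------------------------------------------------------------------------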
def process_samples(self, itr, paths):
    if not self.initialized:
        self.initialize()

    baselines = []
    returns = []
    if hasattr(self.algo.baseline, "predict_n"):
        all_path_baselines = self.algo.baseline.predict_n(paths)
    else:
        all_path_baselines = [
            self.algo.baseline.predict(path) for path in paths
        ]

    for idx, path in enumerate(paths):
        if hasattr(self.algo, '_kwargs'):
            if self.mode.startswith('inception'):
                if idx % 100 == 0:
                    print("paths", idx)
                imgs = [
                    img[0] for img in path['env_infos']['imgs']
                    if img is not None
                ]
                feat = self.sess.run(self.model[1][self.layer],
                                     {self.image: imgs})
                diff = self.means - feat
                diff[self.std == 0] = 0
                diff = diff**2 / (self.std + 1e-5)
                means = np.mean(diff, axis=(1, 2, 3))
                for j in range(25):
                    path["rewards"][j * 2 + 1] -= means[j] * (j**2)
            elif self.mode == 'oracle':
                path["rewards"] += path["env_infos"]["reward_true"]
            elif self.mode.startswith('ours'):
                imgs = [
                    img for img in path['env_infos']['imgs']
                    if img is not None
                ]
                if not hasattr(self, 'means'):
                    # Build the per-viewpoint target features/images once,
                    # from the demonstration videos, and cache them.
                    self.means = []
                    self.imgs = []
                    validdata = np.load(self.algo._kwargs['modeldata'])
                    for vp in range(self.nvp):
                        context = imgs[0][vp]
                        timgs = []
                        tfeats = []
                        nvideos = validdata.shape[1]
                        if self.mode == 'oursinception':
                            nvideos = 50
                        for i in range(nvideos):
                            if i % 10 == 0:
                                print("feats", i)
                            skip = 1
                            if self.name == 'real' or self.name == 'sweep':
                                skip = 2
                            if self.mode == 'oursinception':
                                input_img = validdata[::skip, i]
                            else:
                                input_img = ((validdata[::skip, i] + 1) *
                                             127.5).astype(np.uint8)
                            tfeat, timg = self.sess.run(
                                [self.model.translated_z, self.model.out], {
                                    self.image: [
                                        input_img,
                                        [context] * self.batch_size,
                                        [context] * self.batch_size
                                    ]
                                })
                            timgs.append(timg)
                            tfeats.append(tfeat)
                        self.means.append(np.mean(tfeats, axis=0))
                        meanimgs = np.mean(timgs, axis=0)
                        self.imgs.append(meanimgs)

                if idx % 10 == 0:
                    print("feats", idx)
                costs = 0
                for vp in range(self.nvp):
                    curimgs = [img[vp] for img in imgs]
                    feats, image_trans = self.sess.run(
                        [self.model.input_z, self.image_trans], {
                            self.image: [
                                curimgs,
                                [curimgs[0]] * self.batch_size,
                                curimgs
                            ]
                        })
                    if self.ablation_type == "None":
                        costs += np.sum((self.means[vp] - feats)**2, axis=1) + \
                            self.algo._kwargs['scale'] * np.sum(
                                (self.imgs[vp] - image_trans[0])**2,
                                axis=(1, 2, 3))
                    elif self.ablation_type == "nofeat":
                        costs = self.algo._kwargs['scale'] * np.sum(
                            (self.imgs - image_trans[0])**2, axis=(1, 2, 3))
                    elif self.ablation_type == "noimage":
                        costs = np.sum((self.means - feats)**2, axis=1)
                    elif self.ablation_type == 'recon':
                        # NOTE: image_recon is never fetched above, so this
                        # ablation branch would raise a NameError as written.
                        costs = np.sum((self.means - feats)**2, axis=1) + \
                            self.algo._kwargs['scale'] * np.sum(
                                (image_recon - image_trans[0])**2,
                                axis=(1, 2, 3))
                for j in range(25):
                    path["rewards"][j * 2 + 1] -= costs[j] * (j**2)

        path_baselines = np.append(all_path_baselines[idx], 0)
        deltas = path["rewards"] + \
            self.algo.discount * path_baselines[1:] - \
            path_baselines[:-1]
        path["advantages"] = special.discount_cumsum(
            deltas, self.algo.discount * self.algo.gae_lambda)
        path["returns"] = special.discount_cumsum(path["rewards"],
                                                  self.algo.discount)
        baselines.append(path_baselines[:-1])
        returns.append(path["returns"])

    ev = special.explained_variance_1d(np.concatenate(baselines),
                                       np.concatenate(returns))

    if not self.algo.policy.recurrent:
        observations = tensor_utils.concat_tensor_list(
            [path["observations"] for path in paths])
        actions = tensor_utils.concat_tensor_list(
            [path["actions"] for path in paths])
        rewards = tensor_utils.concat_tensor_list(
            [path["rewards"] for path in paths])
        returns = tensor_utils.concat_tensor_list(
            [path["returns"] for path in paths])
        advantages = tensor_utils.concat_tensor_list(
            [path["advantages"] for path in paths])
        env_infos = tensor_utils.concat_tensor_dict_list(
            [path["env_infos"] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list(
            [path["agent_infos"] for path in paths])

        if self.algo.center_adv:
            advantages = util.center_advantages(advantages)
        if self.algo.positive_adv:
            advantages = util.shift_advantages_to_positive(advantages)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            returns=returns,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            paths=paths,
        )
    else:
        max_path_length = max([len(path["advantages"]) for path in paths])

        # make all paths the same length (pad extra advantages with 0)
        obs = [path["observations"] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)

        if self.algo.center_adv:
            raw_adv = np.concatenate([path["advantages"] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [(path["advantages"] - adv_mean) / adv_std for path in paths]
        else:
            adv = [path["advantages"] for path in paths]
        adv = np.asarray(
            [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

        actions = [path["actions"] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)

        rewards = [path["rewards"] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)

        returns = [path["returns"] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)

        agent_infos = [path["agent_infos"] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in agent_infos
        ])

        env_infos = [path["env_infos"] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in env_infos
        ])

        valids = [np.ones_like(path["returns"]) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        ent = np.sum(self.algo.policy.distribution.entropy(agent_infos) *
                     valids) / np.sum(valids)

        samples_data = dict(
            observations=obs,
            actions=actions,
            advantages=adv,
            rewards=rewards,
            returns=returns,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
        )

    logger.log("fitting baseline...")
    if hasattr(self.algo.baseline, 'fit_with_samples'):
        self.algo.baseline.fit_with_samples(paths, samples_data)
    else:
        self.algo.baseline.fit(paths)
    logger.log("fitted")

    logger.record_tabular('Iteration', itr)
    logger.record_tabular('AverageDiscountedReturn',
                          average_discounted_return)
    logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
    if 'reward_true' in paths[0]['env_infos']:
        trues = [sum(path["env_infos"]["reward_true"]) for path in paths]
        logger.record_tabular('ReturnTrue', np.mean(trues))
        logger.record_tabular('MinTrue', np.min(trues))
        logger.record_tabular('MaxTrue', np.max(trues))
        logger.record_tabular('ArgmaxTrueReturn',
                              trues[np.argmax(undiscounted_returns)])
    # logger.record_tabular('Shaping',
    #     np.mean([path["shaping_reward"] for path in paths]))
    logger.record_tabular('ExplainedVariance', ev)
    logger.record_tabular('NumTrajs', len(paths))
    logger.record_tabular('Entropy', ent)
    logger.record_tabular('Perplexity', np.exp(ent))
    logger.record_tabular('StdReturn', np.std(undiscounted_returns))
    logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
    logger.record_tabular('MinReturn', np.min(undiscounted_returns))
    return samples_data
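
# --- Illustrative sketch (not part of the original file) ---------------------
# The shaping above assumes a visual cost evaluated on 25 keyframes: cost j is
# subtracted at timestep 2*j + 1 with a quadratic time weight, so mismatches
# late in the trajectory are penalized most. The 50-step path length and the
# cost values below are hypothetical.

def _demo_keyframe_shaping():
    rewards = np.zeros(50)
    costs = 0.1 * np.arange(25)  # stand-in for per-keyframe feature distances
    for j in range(25):
        rewards[j * 2 + 1] -= costs[j] * (j ** 2)
    return rewards
# -----------------------------------------------------------------------------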
def cgmprocess_samples(self, itr, paths):
    baselines = []
    returns = []
    for path in paths:
        path_baselines = np.swapaxes(self.algo.baseline.predict(path), 0, 1)
        env_info = path["env_infos"]
        sampleA = env_info["count"].astype(float)
        actionsTaken = sampleA.sum(axis=3)
        sampleS = sampleA.sum(axis=(2, 3))  # + TINY
        H = sampleA.shape[0]
        immediateReward = path["rewards"]

        zeroIndices = (sampleS == 0)
        sampleS[zeroIndices] = 1
        accumulatedReward = immediateReward * actionsTaken / \
            sampleS[:, :, np.newaxis]
        temp = accumulatedReward[H - 1].sum(axis=1)
        for t in range(H - 2, -1, -1):  # xrange in the Python 2 original
            accumulatedReward[t] += (
                temp[np.newaxis, np.newaxis, :] * sampleA[t]).sum(
                    axis=2) / sampleS[t, :, np.newaxis]
            temp = accumulatedReward[t].sum(axis=1)
        sampleS[zeroIndices] = 0

        path["returns"] = accumulatedReward.sum(
            axis=2)  # special.discount_cumsum(path["rewards"], self.algo.discount)
        path["advantages"] = accumulatedReward - \
            path["returns"][:, :, np.newaxis]  # path_baselines
        baselines.append(path_baselines)
        returns.append(path["returns"])

    observations = tensor_utils.concat_tensor_list(
        [path["observations"] for path in paths])
    actions = tensor_utils.concat_tensor_list(
        [path["actions"] for path in paths])
    rewards = tensor_utils.concat_tensor_list(
        [path["rewards"] for path in paths])
    advantages = tensor_utils.concat_tensor_list(
        [path["advantages"] for path in paths])
    agent_infos = tensor_utils.concat_tensor_dict_list(
        [path["agent_infos"] for path in paths])
    returns = np.concatenate(returns)
    baselines = np.concatenate(baselines)

    if not self.algo.policy.recurrent:
        # if self.algo.center_adv:
        #     advantages = util.center_advantages(advantages)
        # if self.algo.positive_adv:
        #     advantages = util.shift_advantages_to_positive(advantages)
        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [np.sum(path["rewards"]) for path in paths]
        ent = np.mean(self.algo.policy.entropy(agent_infos))
        ev = 3  # placeholder
        # ev = special.explained_variance_2d(
        #     baselines,  # np.array([[]]),
        #     returns
        # )
        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            advantages=advantages,
            env_infos=[],
            agent_infos=agent_infos,
            paths=paths,
        )
    else:
        max_path_length = max([len(path["advantages"]) for path in paths])

        # make all paths the same length (pad extra advantages with 0)
        obs = [path["observations"] for path in paths]
        obs = np.array(
            [tensor_utils.pad_tensor(ob, max_path_length) for ob in obs])

        if self.algo.center_adv:
            raw_adv = np.concatenate([path["advantages"] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [(path["advantages"] - adv_mean) / adv_std for path in paths]
        else:
            adv = [path["advantages"] for path in paths]
        adv = np.array(
            [tensor_utils.pad_tensor(a, max_path_length) for a in adv])

        actions = [path["actions"] for path in paths]
        actions = np.array(
            [tensor_utils.pad_tensor(a, max_path_length) for a in actions])

        rewards = [path["rewards"] for path in paths]
        rewards = np.array(
            [tensor_utils.pad_tensor(r, max_path_length) for r in rewards])

        agent_infos = [path["agent_infos"] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in agent_infos
        ])

        env_infos = [path["env_infos"] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in env_infos
        ])

        valids = [np.ones_like(path["returns"]) for path in paths]
        valids = np.array(
            [tensor_utils.pad_tensor(v, max_path_length) for v in valids])

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))
        ev = special.explained_variance_1d(np.concatenate(baselines),
                                           np.concatenate(returns))

        samples_data = dict(
            observations=obs,
            actions=actions,
            advantages=adv,
            rewards=rewards,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
        )

    logger.log("fitting baseline...")
    self.algo.baseline.fit(observations, rewards, returns)
    logger.log("fitted")

    logger.record_tabular('Iteration', itr)
    logger.record_tabular('AverageDiscountedReturn',
                          average_discounted_return)
    AverageReturn = np.mean(undiscounted_returns)
    logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
    with open("result.txt", "a") as myfile:
        myfile.write("%f\n" % AverageReturn)
    logger.record_tabular('ExplainedVariance', ev)
    logger.record_tabular('NumTrajs', len(paths))
    logger.record_tabular('Entropy', ent)
    logger.record_tabular('Perplexity', np.exp(ent))
    logger.record_tabular('StdReturn', np.std(undiscounted_returns))
    logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
    logger.record_tabular('MinReturn', np.min(undiscounted_returns))
    return samples_data
def process_samples(self, itr, paths, prefix='', log=True, task_idx=0,
                    noise_opt=False, joint_opt=False, sess=None):
    baselines = []
    returns = []
    for idx, path in enumerate(paths):
        path["returns"] = special.discount_cumsum(path["rewards"],
                                                  self.algo.discount)

    if log:
        logger.log("fitting baseline...")
    if hasattr(self.algo.baseline, 'fit_with_samples'):
        # NOTE: samples_data is not defined yet at this point, so this branch
        # would raise a NameError if it were ever taken.
        self.algo.baseline.fit_with_samples(paths, samples_data)
    else:
        self.algo.baseline.fit(paths, log=log)
    if log:
        logger.log("fitted")

    if hasattr(self.algo.baseline, "predict_n"):
        all_path_baselines = self.algo.baseline.predict_n(paths)
    else:
        all_path_baselines = [
            self.algo.baseline.predict(path) for path in paths
        ]

    for idx, path in enumerate(paths):
        path_baselines = np.append(all_path_baselines[idx], 0)
        deltas = path["rewards"] + \
            self.algo.discount * path_baselines[1:] - \
            path_baselines[:-1]
        path["advantages"] = special.discount_cumsum(
            deltas, self.algo.discount * self.algo.gae_lambda)
        baselines.append(path_baselines[:-1])
        returns.append(path["returns"])

    ev = special.explained_variance_1d(
        np.concatenate(baselines),
        np.concatenate(returns)
    )

    if joint_opt is True:
        observations = tensor_utils.concat_tensor_list(
            [path["observations"] for path in paths])
        noises = tensor_utils.concat_tensor_list(
            [path["noises"] for path in paths])
        task_idxs = task_idx * np.ones((len(noises),), dtype=np.int32)
        actions = tensor_utils.concat_tensor_list(
            [path["actions"] for path in paths])
        rewards = tensor_utils.concat_tensor_list(
            [path["rewards"] for path in paths])
        returns = tensor_utils.concat_tensor_list(
            [path["returns"] for path in paths])
        advantages = tensor_utils.concat_tensor_list(
            [path["advantages"] for path in paths])
        env_infos = tensor_utils.concat_tensor_dict_list(
            [path["env_infos"] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list(
            [path["agent_infos"] for path in paths])

        if self.algo.center_adv:
            advantages = util.center_advantages(advantages)
        if self.algo.positive_adv:
            advantages = util.shift_advantages_to_positive(advantages)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        debug_avg_ret = np.mean(undiscounted_returns)
        # mean = sess.run(self.algo.policy.all_params["latent_means"])
        # std = sess.run(self.algo.policy.all_params["latent_stds"])
        ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

        samples_data = dict(
            observations=observations,
            noises=noises,
            task_idxs=task_idxs,
            actions=actions,
            rewards=rewards,
            returns=returns,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            paths=paths,
        )

        # For the latent batch, keep only the first timestep of each path
        # (the per-path noise is sampled once, at t=0).
        observations_latent = tensor_utils.concat_tensor_list(
            [path["observations"][0:1] for path in paths])
        noises_latent = tensor_utils.concat_tensor_list(
            [path["noises"][0:1] for path in paths])
        task_idxs_latent = task_idx * np.ones((len(noises_latent),),
                                              dtype=np.int32)
        actions_latent = tensor_utils.concat_tensor_list(
            [path["actions"][0:1] for path in paths])
        rewards_latent = tensor_utils.concat_tensor_list(
            [path["rewards"][0:1] for path in paths])
        returns_latent = tensor_utils.concat_tensor_list(
            [path["returns"][0:1] for path in paths])
        advantages_latent = tensor_utils.concat_tensor_list(
            [path["advantages"][0:1] for path in paths])
        env_infos_latent = tensor_utils.concat_tensor_dict_list(
            [path["env_infos"] for path in paths])
        agent_infos_latent = tensor_utils.concat_tensor_dict_list(
            [path["agent_infos"] for path in paths])

        if self.algo.center_adv:
            advantages_latent = util.center_advantages(advantages_latent)
        if self.algo.positive_adv:
            advantages_latent = util.shift_advantages_to_positive(
                advantages_latent)

        samples_data_latent = dict(
            observations=observations_latent,
            noises=noises_latent,
            task_idxs=task_idxs_latent,
            actions=actions_latent,
            rewards=rewards_latent,
            returns=returns_latent,
            advantages=advantages_latent,
            env_infos=env_infos_latent,
            agent_infos=agent_infos_latent,
            paths=paths,
        )
    elif noise_opt is False:
        observations = tensor_utils.concat_tensor_list(
            [path["observations"] for path in paths])
        noises = tensor_utils.concat_tensor_list(
            [path["noises"] for path in paths])
        task_idxs = task_idx * np.ones((len(noises),), dtype=np.int32)
        actions = tensor_utils.concat_tensor_list(
            [path["actions"] for path in paths])
        rewards = tensor_utils.concat_tensor_list(
            [path["rewards"] for path in paths])
        returns = tensor_utils.concat_tensor_list(
            [path["returns"] for path in paths])
        advantages = tensor_utils.concat_tensor_list(
            [path["advantages"] for path in paths])
        env_infos = tensor_utils.concat_tensor_dict_list(
            [path["env_infos"] for path in paths])
        for path in paths:
            for key in path['agent_infos']:
                if key == 'prob' and len(path['agent_infos'][key].shape) == 3:
                    path['agent_infos'][key] = path['agent_infos'][key][:, 0]
        agent_infos = tensor_utils.concat_tensor_dict_list(
            [path["agent_infos"] for path in paths])

        if self.algo.center_adv:
            advantages = util.center_advantages(advantages)
        if self.algo.positive_adv:
            advantages = util.shift_advantages_to_positive(advantages)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

        samples_data = dict(
            observations=observations,
            noises=noises,
            task_idxs=task_idxs,
            actions=actions,
            rewards=rewards,
            returns=returns,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            paths=paths,
        )
    elif noise_opt is True:
        observations = tensor_utils.concat_tensor_list(
            [path["observations"][0:1] for path in paths])
        noises = tensor_utils.concat_tensor_list(
            [path["noises"][0:1] for path in paths])
        task_idxs = task_idx * np.ones((len(noises),), dtype=np.int32)
        actions = tensor_utils.concat_tensor_list(
            [path["actions"][0:1] for path in paths])
        rewards = tensor_utils.concat_tensor_list(
            [path["rewards"][0:1] for path in paths])
        returns = tensor_utils.concat_tensor_list(
            [path["returns"][0:1] for path in paths])
        advantages = tensor_utils.concat_tensor_list(
            [path["advantages"][0:1] for path in paths])
        env_infos = tensor_utils.concat_tensor_dict_list(
            [path["env_infos"] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list(
            [path["agent_infos"] for path in paths])

        if self.algo.center_adv:
            advantages = util.center_advantages(advantages)
        if self.algo.positive_adv:
            advantages = util.shift_advantages_to_positive(advantages)

        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))

        samples_data = dict(
            observations=observations,
            noises=noises,
            task_idxs=task_idxs,
            actions=actions,
            rewards=rewards,
            returns=returns,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            paths=paths,
        )

    if log:
        # logger.record_tabular('Iteration', itr)
        # logger.record_tabular('AverageDiscountedReturn',
        #                       average_discounted_return)
        # NOTE: `path` here is the loop variable left over from the loops
        # above, i.e. the last processed path.
        for key in path['env_infos']:
            info_returns = [sum(path["env_infos"][key]) for path in paths]
            logger.record_tabular(prefix + 'Average' + key,
                                  np.mean(info_returns))
            logger.record_tabular(prefix + 'Max' + key, np.max(info_returns))
        logger.record_tabular(prefix + 'AverageReturn',
                              np.mean(undiscounted_returns))
        logger.record_tabular(prefix + 'ExplainedVariance', ev)
        logger.record_tabular(prefix + 'NumTrajs', len(paths))
        logger.record_tabular(prefix + 'Entropy', ent)
        logger.record_tabular(prefix + 'Perplexity', np.exp(ent))
        logger.record_tabular(prefix + 'StdReturn',
                              np.std(undiscounted_returns))
        logger.record_tabular(prefix + 'MaxReturn',
                              np.max(undiscounted_returns))
        logger.record_tabular(prefix + 'MinReturn',
                              np.min(undiscounted_returns))

    if joint_opt is True:
        return samples_data, samples_data_latent
    else:
        return samples_data
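
# --- Illustrative sketch (not part of the original file) ---------------------
# The noise_opt / joint_opt latent batches above keep only slice [0:1] of each
# path: the per-trajectory noise is sampled once at t=0, so one timestep per
# trajectory suffices for the latent update. Shown with np.concatenate in
# place of tensor_utils.concat_tensor_list; shapes are hypothetical.

def _demo_first_timestep_batch():
    paths = [{"observations": np.ones((5, 3)), "noises": np.zeros((5, 2))},
             {"observations": np.ones((7, 3)), "noises": np.zeros((7, 2))}]
    observations_latent = np.concatenate(
        [p["observations"][0:1] for p in paths])
    noises_latent = np.concatenate([p["noises"][0:1] for p in paths])
    assert observations_latent.shape == (2, 3)
    assert noises_latent.shape == (2, 2)
# -----------------------------------------------------------------------------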
def _process_samples(self, itr, paths):
    baselines = []
    returns = []

    # compute path baselines
    all_path_baselines = [self.algo.baseline.predict(path) for path in paths]

    # compute advantages and returns at every timestep
    for idx, path in enumerate(paths):
        path_baselines = np.append(all_path_baselines[idx], 0)
        deltas = path["rewards"] + \
            self.algo.discount * path_baselines[1:] - \
            path_baselines[:-1]
        path["advantages"] = special.discount_cumsum(
            deltas, self.algo.discount * self.algo.gae_lambda)
        path["returns"] = special.discount_cumsum(path["rewards"],
                                                  self.algo.discount)
        baselines.append(path_baselines[:-1])
        returns.append(path["returns"])

    ev = special.explained_variance_1d(np.concatenate(baselines),
                                       np.concatenate(returns))

    # formulate samples data, subselecting timesteps during which an action
    # was actually taken by the policy
    observations = tensor_utils.concat_tensor_list(
        [path["observations"][path['update_idxs']] for path in paths])
    actions = tensor_utils.concat_tensor_list(
        [path["actions"][path['update_idxs']] for path in paths])
    rewards = tensor_utils.concat_tensor_list(
        [path["rewards"] for path in paths])
    returns = tensor_utils.concat_tensor_list(
        [path["returns"][path['update_idxs']] for path in paths])
    advantages = tensor_utils.concat_tensor_list(
        [path["advantages"][path['update_idxs']] for path in paths])

    idxs = [path['update_idxs'] for path in paths]
    hgail.misc.utils.subselect_dict_list_idxs(paths, 'env_infos', idxs)
    hgail.misc.utils.subselect_dict_list_idxs(paths, 'agent_infos', idxs)

    env_infos = tensor_utils.concat_tensor_dict_list(
        [path["env_infos"] for path in paths])
    agent_infos = tensor_utils.concat_tensor_dict_list(
        [path["agent_infos"] for path in paths])

    if self.algo.center_adv:
        advantages = util.center_advantages(advantages)
    if self.algo.positive_adv:
        advantages = util.shift_advantages_to_positive(advantages)

    samples_data = dict(
        observations=observations,
        actions=actions,
        rewards=rewards,
        returns=returns,
        advantages=advantages,
        env_infos=env_infos,
        agent_infos=agent_infos,
        paths=paths,
    )

    logger.log("fitting baseline...")
    self.algo.baseline.fit(paths)
    logger.log("fitted")

    undiscounted_returns = [sum(path["rewards"]) for path in paths]
    average_discounted_return = np.mean(
        [path["returns"][0] for path in paths])
    # bug with computing entropy
    # ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))
    ent = 0.

    logger.record_tabular('Iteration', itr)
    logger.record_tabular('AverageDiscountedReturn',
                          average_discounted_return)
    logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
    logger.record_tabular('ExplainedVariance', ev)
    logger.record_tabular('NumTrajs', len(paths))
    logger.record_tabular('Entropy', ent)
    logger.record_tabular('Perplexity', np.exp(ent))
    logger.record_tabular('StdReturn', np.std(undiscounted_returns))
    logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
    logger.record_tabular('MinReturn', np.min(undiscounted_returns))

    return samples_data
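
# --- Illustrative sketch (not part of the original file) ---------------------
# What the update_idxs subselection above does: with temporally extended
# actions (e.g. a hierarchical policy that only chooses every few steps),
# only the timesteps where the policy actually acted enter the batch. The
# values below are hypothetical.

def _demo_update_idx_subselect():
    path = {"observations": np.arange(12).reshape(6, 2),
            "advantages": np.linspace(0.0, 1.0, 6),
            "update_idxs": np.array([0, 2, 5])}
    obs_sel = path["observations"][path["update_idxs"]]  # shape (3, 2)
    adv_sel = path["advantages"][path["update_idxs"]]    # shape (3,)
    return obs_sel, adv_sel
# -----------------------------------------------------------------------------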
def process_samples(self, itr, paths):
    baselines = []
    returns = []
    n = len(paths[0]["rewards"])
    for i in range(n):
        baselines.append([])
        returns.append([])

    if len(paths) > 0 and "vf" in paths[0]["agent_infos"]:
        all_path_baselines = [
            p["agent_infos"]["vf"].flatten() for p in paths
        ]
    else:
        if hasattr(self.algo.baseline, "predict_n"):
            raise NotImplementedError
        else:
            all_path_baselines = [[
                self.algo.NPOs[i].baseline.predict(path, idx=i)
                for i in range(n)
            ] for path in paths]

    for idx, path in enumerate(paths):
        path["advantages"] = []
        path["returns"] = []
        for i in range(n):
            path_baselines = np.append(all_path_baselines[idx][i], 0)
            deltas = path["rewards"][i] + \
                self.algo.discount * path_baselines[1:] - \
                path_baselines[:-1]
            path["advantages"].append(
                special.discount_cumsum(
                    deltas, self.algo.discount * self.algo.gae_lambda))
            path["returns"].append(
                special.discount_cumsum(path["rewards"][i],
                                        self.algo.discount))
            baselines[i].append(path_baselines[:-1])
            returns[i].append(path["returns"][i])

    ev = [
        special.explained_variance_1d(np.concatenate(baselines[i]),
                                      np.concatenate(returns[i]))
        for i in range(n)
    ]

    if not self.algo.policy.recurrent:
        tensor_concat = lambda key: [
            tensor_utils.concat_tensor_list(x)
            for x in regroup([path[key] for path in paths])
        ]
        tensor_concat_d = lambda key: [
            tensor_utils.concat_tensor_dict_list(x)
            for x in regroup([path[key] for path in paths])
        ]
        observations_n = tensor_concat("observations")
        actions_n = tensor_concat("actions")
        rewards_n = tensor_concat("rewards")
        returns_n = tensor_concat("returns")
        advantages_n = tensor_concat("advantages")
        # env_infos_n = tensor_concat_d("env_infos")
        agent_infos_n = tensor_concat_d("agent_infos")
        # TODO(cathywu) make consistent with the rest (above)?
        # env_infos = tensor_utils.concat_tensor_dict_list(
        #     [path["env_infos"] for path in paths])

        if self.algo.center_adv:
            advantages_n = [
                util.center_advantages(advantages)
                for advantages in advantages_n
            ]
        if self.algo.positive_adv:
            advantages_n = [
                util.shift_advantages_to_positive(advantages)
                for advantages in advantages_n
            ]

        average_discounted_return = \
            np.mean([sum(path["returns"][i][0] for i in range(n))
                     for path in paths])
        undiscounted_returns = [sum(sum(path["rewards"])) for path in paths]
        ent = np.mean(
            self.algo.policy.get_distribution(idx=0).entropy(
                agent_infos_n[0]))

        samples_data_n = [
            dict(
                observations=observations_n[i],
                actions=actions_n[i],
                rewards=rewards_n[i],
                returns=returns_n[i],
                advantages=advantages_n[i],
                # env_infos=env_infos,
                agent_infos=agent_infos_n[i],
                # paths=paths,
            ) for i in range(n)
        ]
    else:
        raise NotImplementedError  # was `return NotImplementedError`

    logger.log("fitting baseline...")
    if hasattr(self.algo.baseline, 'fit_with_samples'):
        raise NotImplementedError
    else:
        for idx in range(len(self.algo.NPOs)):
            self.algo.NPOs[idx].baseline.fit(paths, idx=idx)
    logger.log("fitted")

    logger.record_tabular('Iteration', itr)
    logger.record_tabular('AverageDiscountedReturn',
                          average_discounted_return)
    for i in range(len(ev)):
        logger.record_tabular('ExplainedVariance-k%d' % i, ev[i])
    logger.record_tabular('NumTrajs', len(paths))
    logger.record_tabular_misc_stat('TrajLen',
                                    [len(p["rewards"][0]) for p in paths],
                                    placement='front')
    logger.record_tabular('Entropy', ent)
    logger.record_tabular('Perplexity', np.exp(ent))
    logger.record_tabular_misc_stat('Return', undiscounted_returns,
                                    placement='front')
    return samples_data_n
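
# --- Illustrative sketch (not part of the original file) ---------------------
# `regroup` is not defined in this fragment; the usage above implies a nested
# list transpose, turning per-path lists of per-objective arrays into
# per-objective lists of per-path arrays. A minimal version consistent with
# that usage (the _demo_ name is hypothetical):

def _demo_regroup(per_path):
    # per_path[path_idx][objective_idx] -> result[objective_idx][path_idx]
    return [list(group) for group in zip(*per_path)]


assert _demo_regroup([[1, 2], [3, 4], [5, 6]]) == [[1, 3, 5], [2, 4, 6]]
# -----------------------------------------------------------------------------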
def record_statistics(self, itr, paths, baselines, returns):
    evs = [
        special.explained_variance_1d(np.concatenate(baselines[i]),
                                      np.concatenate(returns[i]))
        for i in range(len(baselines))
    ]
    # reverse so that evs is indexed by batch age (age 0 = newest)
    evs = evs[::-1]
    average_discounted_return, undiscounted_returns, ent = \
        self.statistics_for_new_paths()

    logger.record_tabular('Iteration', itr)
    logger.record_tabular('AverageDiscountedReturn',
                          average_discounted_return)
    logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
    logger.record_tabular('ExplainedVariance', evs[0])
    logger.record_tabular('NumBatches', len(self.experience_replay))
    logger.record_tabular('NumTrajs', len(paths))
    logger.record_tabular('MeanPathLen', self.mean_path_len)
    logger.record_tabular('EnvInteracts', self.env_interacts)
    logger.record_tabular('TotalEnvInteracts', self.total_env_interacts)
    logger.record_tabular('Entropy', ent)
    logger.record_tabular('Perplexity', np.exp(ent))
    logger.record_tabular('StdReturn', np.std(undiscounted_returns))
    logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
    logger.record_tabular('MinReturn', np.min(undiscounted_returns))

    if self.algo.batch_aggregate_n > 1:
        # Batches in experience_replay and the weight lists are stored
        # oldest-first, so the reversed lists are indexed by age.
        for age in range(self.algo.batch_aggregate_n):
            if age < len(self.experience_replay):
                raw_weight = self.raw_weights[::-1][age]
                weight = self.weights[::-1][age]
                logger.record_tabular('RawWeight_age_' + str(age), raw_weight)
                logger.record_tabular('ScaledWeight_age_' + str(age), weight)
                if age > 0 and self.algo.importance_sampling:
                    IS = self.get_IS(age)
                    logger.record_tabular('MeanISCoeff_age_' + str(age),
                                          np.mean(IS))
                    logger.record_tabular('StdISCoeff_age_' + str(age),
                                          np.std(IS))
                    logger.record_tabular('MaxISCoeff_age_' + str(age),
                                          np.max(IS))
                    logger.record_tabular('MinISCoeff_age_' + str(age),
                                          np.min(IS))
                logger.record_tabular('ExplainedVariance_age_' + str(age),
                                      evs[age])
            else:
                logger.record_tabular('RawWeight_age_' + str(age), 0)
                logger.record_tabular('ScaledWeight_age_' + str(age), 0)
                if age > 0 and self.algo.importance_sampling:
                    logger.record_tabular('MeanISCoeff_age_' + str(age), 0)
                    logger.record_tabular('StdISCoeff_age_' + str(age), 0)
                    logger.record_tabular('MaxISCoeff_age_' + str(age), 0)
                    logger.record_tabular('MinISCoeff_age_' + str(age), 0)
                logger.record_tabular('ExplainedVariance_age_' + str(age), 0)

    if self.algo.exploration_bonus:
        bonuses = tensor_utils.concat_tensor_list(
            [path["bonuses"] for path in paths])
        logger.record_tabular('MeanRawBonus', self.bonus_mean)
        logger.record_tabular('MeanBonus', np.mean(bonuses))
        logger.record_tabular('StdBonus', np.std(bonuses))
        logger.record_tabular('MaxBonus', np.max(bonuses))
        bonus_sums = np.array([np.sum(path["bonuses"]) for path in paths])
        logger.record_tabular('MeanBonusSum', np.mean(bonus_sums))
        logger.record_tabular('StdBonusSum', np.std(bonus_sums))
        if self.algo.batch_aggregate_n > 1:
            new_bonuses = tensor_utils.concat_tensor_list(
                [path["bonuses"] for path in self.experience_replay[-1]])
            logger.record_tabular('NewPathsMeanBonus', np.mean(new_bonuses))
            logger.record_tabular('NewPathsStdBonus', np.std(new_bonuses))
            logger.record_tabular('NewPathsMaxBonus', np.max(new_bonuses))
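
# --- Illustrative sketch (not part of the original file) ---------------------
# The age indexing above relies on experience_replay and the weight lists
# being stored oldest-first, so `weights[::-1][age]` reads "age 0 is the
# newest batch". Hypothetical values:

def _demo_age_indexing():
    weights = [0.2, 0.3, 0.5]  # oldest ... newest
    newest_first = weights[::-1]
    assert newest_first[0] == 0.5  # age 0 = most recent batch
# -----------------------------------------------------------------------------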
def process_samples(self, itr, paths):
    baselines = []
    returns = []
    env_returns = []
    if hasattr(self.algo.baseline, "predict_n"):
        all_path_baselines = self.algo.baseline.predict_n(paths)
    else:
        all_path_baselines = [
            self.algo.baseline.predict(path) for path in paths
        ]
    for idx, path in enumerate(paths):
        path_baselines = np.append(all_path_baselines[idx], 0)
        deltas = path["rewards"] + \
                 self.algo.discount * path_baselines[1:] - \
                 path_baselines[:-1]
        path["advantages"] = special.discount_cumsum(
            deltas, self.algo.discount * self.algo.gae_lambda)
        path["returns"] = special.discount_cumsum(path["rewards"],
                                                  self.algo.discount)
        baselines.append(path_baselines[:-1])
        returns.append(path["returns"])
        if "env_rewards" in path:
            path["env_returns"] = special.discount_cumsum(
                path["env_rewards"], self.algo.discount)
            env_returns.append(path["env_returns"])
    concat_baselines = np.concatenate(baselines)
    concat_returns = np.concatenate(returns)
    ev = special.explained_variance_1d(concat_baselines, concat_returns)
    if not self.algo.policy.recurrent:
        observations = tensor_utils.concat_tensor_list(
            [path["observations"] for path in paths])
        actions = tensor_utils.concat_tensor_list(
            [path["actions"] for path in paths])
        rewards = tensor_utils.concat_tensor_list(
            [path["rewards"] for path in paths])
        returns = tensor_utils.concat_tensor_list(
            [path["returns"] for path in paths])
        advantages = tensor_utils.concat_tensor_list(
            [path["advantages"] for path in paths])
        env_infos = tensor_utils.concat_tensor_dict_list(
            [path["env_infos"] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list(
            [path["agent_infos"] for path in paths])
        env_returns = [
            path["env_rewards"] for path in paths
            if "env_rewards" in path
        ]
        if self.algo.center_adv:
            advantages = util.center_advantages(advantages)
        if self.algo.positive_adv:
            advantages = util.shift_advantages_to_positive(advantages)
        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        # returns[0] is the discounted return of the whole path, since
        # discount_cumsum accumulates from the end of the trajectory
        average_discounted_env_return = \
            np.mean([path["env_returns"][0] for path in paths
                     if "env_returns" in path])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        undiscounted_env_returns = [
            sum(path["env_rewards"]) for path in paths
            if "env_rewards" in path
        ]
        ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))
        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            returns=returns,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            paths=paths,
        )
    else:
        max_path_length = max([len(path["advantages"]) for path in paths])
        # make all paths the same length (pad extra advantages with 0)
        obs = [path["observations"] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)
        if self.algo.center_adv:
            raw_adv = np.concatenate(
                [path["advantages"] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [(path["advantages"] - adv_mean) / adv_std
                   for path in paths]
        else:
            adv = [path["advantages"] for path in paths]
        adv = np.asarray(
            [tensor_utils.pad_tensor(a, max_path_length) for a in adv])
        actions = [path["actions"] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)
        rewards = [path["rewards"] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)
        returns = [path["returns"] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)
        env_returns = [
            path["env_rewards"] for path in paths
            if "env_rewards" in path
        ]
        env_returns = tensor_utils.pad_tensor_n(env_returns,
                                                max_path_length)
        agent_infos = [path["agent_infos"] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in agent_infos
        ])
        env_infos = [path["env_infos"] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in env_infos
        ])
        valids = [np.ones_like(path["returns"]) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)
        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        average_discounted_env_return = \
            np.mean([path["env_returns"][0] for path in paths
                     if "env_returns" in path])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        undiscounted_env_returns = [
            sum(path["env_rewards"]) for path in paths
            if "env_rewards" in path
        ]
        ent = np.sum(
            self.algo.policy.distribution.entropy(agent_infos) *
            valids) / np.sum(valids)
        samples_data = dict(
            observations=obs,
            actions=actions,
            advantages=adv,
            rewards=rewards,
            returns=returns,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
        )
    logger.log("fitting baseline...")
    if hasattr(self.algo.baseline, 'fit_with_samples'):
        self.algo.baseline.fit_with_samples(paths, samples_data)
    else:
        self.algo.baseline.fit(paths)
    logger.log("fitted")
    logger.record_tabular('Iteration', itr)
    logger.record_tabular('AverageDiscountedReturn',
                          average_discounted_return)
    if not np.isnan(average_discounted_env_return):
        logger.record_tabular("AverageDiscountedEnvReturn",
                              average_discounted_env_return)
    logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
    if not np.isnan(np.mean(undiscounted_env_returns)):
        logger.record_tabular('AverageEnvReturn',
                              np.mean(undiscounted_env_returns))
    logger.record_tabular('ExplainedVariance', ev)
    logger.record_tabular('NumTrajs', len(paths))
    logger.record_tabular('Entropy', ent)
    logger.record_tabular('Perplexity', np.exp(ent))
    logger.record_tabular('StdReturn', np.std(undiscounted_returns))
    logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
    logger.record_tabular('MinReturn', np.min(undiscounted_returns))
    # tabular logging expects scalars, so record means of the concatenated
    # arrays rather than the full arrays themselves
    logger.record_tabular('MeanConcatReturns', np.mean(concat_returns))
    logger.record_tabular('MeanConcatBaselines', np.mean(concat_baselines))
    if undiscounted_env_returns:
        logger.record_tabular('StdEnvReturn',
                              np.std(undiscounted_env_returns))
        logger.record_tabular('MaxEnvReturn',
                              np.max(undiscounted_env_returns))
        logger.record_tabular('MinEnvReturn',
                              np.min(undiscounted_env_returns))
    return samples_data
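# ---------------------------------------------------------------------------
# Illustrative sketch (added; not part of the original source): the GAE
# computation shared by the variants in this file, run on a toy path with a
# plain-numpy stand-in for rllab's special.discount_cumsum. All array values
# below are made up for the example.
import numpy as np

def discount_cumsum(x, discount):
    # y[t] = sum_k discount**k * x[t + k]  (reverse discounted cumsum)
    y = np.zeros(len(x))
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + discount * running
        y[t] = running
    return y

rewards = np.array([1.0, 0.0, 2.0])
baselines = np.append(np.array([0.5, 0.4, 0.9]), 0)  # V(s_t), terminal V = 0
discount, gae_lambda = 0.99, 0.95
deltas = rewards + discount * baselines[1:] - baselines[:-1]
advantages = discount_cumsum(deltas, discount * gae_lambda)
returns = discount_cumsum(rewards, discount)
print(advantages, returns)
# ---------------------------------------------------------------------------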
def process_samples(self, itr, paths):
    # IMPORTANT:
    # Rewards accrued from a_t to a_t+1 are expected to be discounted by
    # the environment to values at time t
    # paths = list(itertools.chain.from_iterable(paths))
    baselines = []
    returns = []
    if hasattr(self.algo.baseline, "predict_n"):
        all_path_baselines = self.algo.baseline.predict_n(paths)
    else:
        all_path_baselines = [
            self.algo.baseline.predict(path) for path in paths
        ]
    for idx, path in enumerate(paths):
        t_sojourn = path["offset_t_sojourn"]
        gamma = self.algo.discount
        lamda = self.algo.gae_lambda
        discount_gamma = np.exp(-gamma * t_sojourn)
        discount_gamma_lambda = np.exp(-gamma * lamda * t_sojourn)
        path_baselines = np.append(all_path_baselines[idx], 0)
        if len(path["rewards"]) != len(t_sojourn):
            # TODO HANDLE INFINITE HORIZON GAMES
            pdb.set_trace()
        deltas = path["rewards"] + \
                 discount_gamma * path_baselines[1:] - \
                 path_baselines[:-1]
        path["advantages"] = variable_discount_cumsum(
            deltas, discount_gamma_lambda)
        path["returns"] = variable_discount_cumsum(path["rewards"],
                                                   discount_gamma)
        baselines.append(path_baselines[:-1])
        returns.append(path["returns"])
    ev = special.explained_variance_1d(np.concatenate(baselines),
                                       np.concatenate(returns))
    if not self.algo.policy.recurrent:
        observations = tensor_utils.concat_tensor_list(
            [path["observations"] for path in paths])
        actions = tensor_utils.concat_tensor_list(
            [path["actions"] for path in paths])
        rewards = tensor_utils.concat_tensor_list(
            [path["rewards"] for path in paths])
        returns = tensor_utils.concat_tensor_list(
            [path["returns"] for path in paths])
        advantages = tensor_utils.concat_tensor_list(
            [path["advantages"] for path in paths])
        env_infos = tensor_utils.concat_tensor_dict_list(
            [path["env_infos"] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list(
            [path["agent_infos"] for path in paths])
        if self.algo.center_adv:
            advantages = util.center_advantages(advantages)
        if self.algo.positive_adv:
            advantages = util.shift_advantages_to_positive(advantages)
        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))
        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            returns=returns,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            paths=paths,
        )
    else:
        max_path_length = max([len(path["advantages"]) for path in paths])
        # make all paths the same length (pad extra advantages with 0)
        obs = [path["observations"] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)
        if self.algo.center_adv:
            raw_adv = np.concatenate(
                [path["advantages"] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [(path["advantages"] - adv_mean) / adv_std
                   for path in paths]
        else:
            adv = [path["advantages"] for path in paths]
        adv = np.asarray(
            [tensor_utils.pad_tensor(a, max_path_length) for a in adv])
        actions = [path["actions"] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)
        rewards = [path["rewards"] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)
        returns = [path["returns"] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)
        agent_infos = [path["agent_infos"] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in agent_infos
        ])
        env_infos = [path["env_infos"] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in env_infos
        ])
        valids = [np.ones_like(path["returns"]) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)
        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        ent = np.sum(
            self.algo.policy.distribution.entropy(agent_infos) *
            valids) / np.sum(valids)
        samples_data = dict(
            observations=obs,
            actions=actions,
            advantages=adv,
            rewards=rewards,
            returns=returns,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
        )
    logger.log("fitting baseline...")
    if hasattr(self.algo.baseline, 'fit_with_samples'):
        self.algo.baseline.fit_with_samples(paths, samples_data)
    else:
        self.algo.baseline.fit(paths)
    logger.log("fitted")
    logger.record_tabular('Iteration', itr)
    logger.record_tabular('AverageDiscountedReturn',
                          average_discounted_return)
    logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
    logger.record_tabular('ExplainedVariance', ev)
    logger.record_tabular('NumTrajs', len(paths))
    logger.record_tabular('Entropy', ent)
    logger.record_tabular('Perplexity', np.exp(ent))
    logger.record_tabular('StdReturn', np.std(undiscounted_returns))
    logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
    logger.record_tabular('MinReturn', np.min(undiscounted_returns))
    return samples_data
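# ---------------------------------------------------------------------------
# Illustrative sketch (added; not part of the original source): a plausible
# variable_discount_cumsum for the semi-MDP variant above, where each step
# carries its own discount exp(-gamma * dt). The indexing convention used
# here (d[t] discounts everything after step t) is an assumption; the helper
# the original code imports may differ.
import numpy as np

def variable_discount_cumsum(x, d):
    # y[t] = x[t] + d[t] * y[t+1], with per-step discount factors d
    y = np.zeros(len(x))
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + d[t] * running
        y[t] = running
    return y

rewards = np.array([1.0, 0.5, 2.0])
t_sojourn = np.array([0.1, 0.3, 0.2])  # made-up sojourn times
gamma = 1.0
print(variable_discount_cumsum(rewards, np.exp(-gamma * t_sojourn)))
# ---------------------------------------------------------------------------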
def process_samples(self, itr, paths, update_baseline=True):
    baselines = []
    returns = []
    if hasattr(self.algo.baseline, "predict_n"):
        all_path_baselines = self.algo.baseline.predict_n(paths)
    else:
        all_path_baselines = [
            self.algo.baseline.predict(path) for path in paths
        ]
    for idx, path in enumerate(paths):
        path_baselines = np.append(all_path_baselines[idx], 0)
        deltas = path["rewards"] + \
                 self.algo.discount * path_baselines[1:] - \
                 path_baselines[:-1]
        path["advantages"] = special.discount_cumsum(
            deltas, self.algo.discount * self.algo.gae_lambda)
        path["returns"] = special.discount_cumsum(path["rewards"],
                                                  self.algo.discount)
        baselines.append(path_baselines[:-1])
        returns.append(path["returns"])
    if hasattr(self.algo, 'epopt_epsilon'):
        if self.algo.epopt_epsilon < 1.0 and \
                self.algo.epopt_after_iter <= itr:
            # prune the paths, EPOpt-style, down to the epsilon-fraction
            # with the lowest initial returns. Note this loop pops from
            # `paths` while indexing into it and compares a sorted rank
            # against a path count, which is fragile; a cleaner equivalent
            # is sketched after this function.
            target_path_size = len(paths) * self.algo.epopt_epsilon
            sorted_indices = np.argsort(
                [path["returns"][0] for path in paths])
            idx = 0
            si_idx = 0
            while True:
                if sorted_indices[si_idx] > target_path_size:
                    paths.pop(idx)
                    idx -= 1
                idx += 1
                si_idx += 1
                if idx >= len(paths):
                    break
    ev = special.explained_variance_1d(np.concatenate(baselines),
                                       np.concatenate(returns))
    if not self.algo.policy.recurrent:
        observations = tensor_utils.concat_tensor_list(
            [path["observations"] for path in paths])
        actions = tensor_utils.concat_tensor_list(
            [path["actions"] for path in paths])
        rewards = tensor_utils.concat_tensor_list(
            [path["rewards"] for path in paths])
        returns = tensor_utils.concat_tensor_list(
            [path["returns"] for path in paths])
        advantages = tensor_utils.concat_tensor_list(
            [path["advantages"] for path in paths])
        env_infos = tensor_utils.concat_tensor_dict_list(
            [path["env_infos"] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list(
            [path["agent_infos"] for path in paths])
        if self.algo.center_adv:
            advantages = util.center_advantages(advantages)
        if self.algo.positive_adv:
            advantages = util.shift_advantages_to_positive(advantages)
        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        # only count returns from paths rolled out under the real dynamics
        undiscounted_returns = []
        ct = 0
        for path in paths:
            if path['env_infos']['dyn_model_id'][-1] == 0:
                undiscounted_returns.append(sum(path["rewards"]))
            if path['env_infos']['dyn_model_id'][-1] == 1:
                ct += 1
        print('path count with fake dynamics: ', ct,
              len(undiscounted_returns), len(paths))
        ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))
        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            returns=returns,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            paths=paths,
        )
    else:
        max_path_length = max([len(path["advantages"]) for path in paths])
        # make all paths the same length (pad extra advantages with 0)
        obs = [path["observations"] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)
        if self.algo.center_adv:
            raw_adv = np.concatenate(
                [path["advantages"] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [(path["advantages"] - adv_mean) / adv_std
                   for path in paths]
        else:
            adv = [path["advantages"] for path in paths]
        adv = np.asarray(
            [tensor_utils.pad_tensor(a, max_path_length) for a in adv])
        actions = [path["actions"] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)
        rewards = [path["rewards"] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)
        returns = [path["returns"] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)
        agent_infos = [path["agent_infos"] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in agent_infos
        ])
        env_infos = [path["env_infos"] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in env_infos
        ])
        valids = [np.ones_like(path["returns"]) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)
        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        ent = np.sum(
            self.algo.policy.distribution.entropy(agent_infos) *
            valids) / np.sum(valids)
        samples_data = dict(
            observations=obs,
            actions=actions,
            advantages=adv,
            rewards=rewards,
            returns=returns,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
        )
    if update_baseline:
        logger.log("fitting baseline...")
        if hasattr(self.algo.baseline, 'fit_with_samples'):
            self.algo.baseline.fit_with_samples(paths, samples_data)
        else:
            self.algo.baseline.fit(paths)
        logger.log("fitted")
    logger.record_tabular('Iteration', itr)
    logger.record_tabular('AverageDiscountedReturn',
                          average_discounted_return)
    logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
    logger.record_tabular('ExplainedVariance', ev)
    logger.record_tabular('NumTrajs', len(paths))
    logger.record_tabular('Entropy', ent)
    logger.record_tabular('Perplexity', np.exp(ent))
    logger.record_tabular('StdReturn', np.std(undiscounted_returns))
    logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
    logger.record_tabular('MinReturn', np.min(undiscounted_returns))
    return samples_data
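# ---------------------------------------------------------------------------
# Illustrative sketch (added; not part of the original source): what the
# EPOpt epsilon-percentile pruning above appears to intend -- keep only the
# worst epsilon-fraction of paths by initial return. This is an assumption
# about the intended behaviour, not the author's exact method.
import numpy as np

def epopt_prune(paths, epsilon):
    keep = max(1, int(len(paths) * epsilon))
    order = np.argsort([path["returns"][0] for path in paths])  # ascending
    keep_idx = set(order[:keep])
    return [p for i, p in enumerate(paths) if i in keep_idx]

paths = [{"returns": np.array([r])} for r in [5.0, 1.0, 3.0, 2.0]]
print([p["returns"][0] for p in epopt_prune(paths, 0.5)])  # -> [1.0, 2.0]
# ---------------------------------------------------------------------------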
def process_samples_skill_dependent(self, itr, paths):
    # need to generate the correct observations using the outer product
    new_paths = []
    for i in range(len(paths)):
        latents = paths[i]['agent_infos']['latents']
        observations = paths[i]['observations']
        # insert the time_remaining
        time_remaining = paths[i]['agent_infos']['time_remaining'].reshape(
            len(observations), 1)
        extended_obs = np.concatenate([observations, time_remaining],
                                      axis=1)
        # new_observations = np.matmul(observations[:, :, np.newaxis], latents[:, np.newaxis, :]).reshape(observations.shape[0], -1)
        new_observations = np.matmul(extended_obs[:, :, np.newaxis],
                                     latents[:, np.newaxis, :]).reshape(
                                         extended_obs.shape[0], -1)
        new_observations = np.concatenate(
            [new_observations, extended_obs, latents], axis=1)
        new_paths.append(
            dict(observations=new_observations,
                 rewards=paths[i]['rewards'],
                 returns=paths[i]['returns']))
    paths = new_paths
    baselines = []
    returns = []
    if hasattr(self.algo.skill_dependent_baseline, "predict_n"):
        all_path_baselines = self.algo.skill_dependent_baseline.predict_n(
            paths)
    else:
        all_path_baselines = [
            self.algo.skill_dependent_baseline.predict(path)
            for path in paths
        ]
    for idx, path in enumerate(paths):
        path_baselines = np.append(all_path_baselines[idx], 0)
        deltas = path["rewards"] + \
                 self.algo.discount * path_baselines[1:] - \
                 path_baselines[:-1]
        path["advantages"] = special.discount_cumsum(
            deltas, self.algo.discount * self.algo.gae_lambda)
        path["returns"] = special.discount_cumsum(path["rewards"],
                                                  self.algo.discount)
        baselines.append(path_baselines[:-1])
        returns.append(path["returns"])
    ev = special.explained_variance_1d(np.concatenate(baselines),
                                       np.concatenate(returns))
    if not self.algo.policy.recurrent:
        advantages = tensor_utils.concat_tensor_list(
            [path["advantages"] for path in paths])
        if self.algo.center_adv:
            advantages = util.center_advantages(advantages)
        if self.algo.positive_adv:
            advantages = util.shift_advantages_to_positive(advantages)
        samples_data = dict(advantages=advantages)
    else:
        max_path_length = max([len(path["advantages"]) for path in paths])
        if self.algo.center_adv:
            raw_adv = np.concatenate(
                [path["advantages"] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [(path["advantages"] - adv_mean) / adv_std
                   for path in paths]
        else:
            adv = [path["advantages"] for path in paths]
        adv = np.asarray(
            [tensor_utils.pad_tensor(a, max_path_length) for a in adv])
        samples_data = dict(advantages=adv)
    logger.log("fitting skill-dependent baseline...")
    if hasattr(self.algo.skill_dependent_baseline, 'fit_with_samples'):
        self.algo.skill_dependent_baseline.fit_with_samples(
            paths, samples_data)
    else:
        self.algo.skill_dependent_baseline.fit(paths)
    logger.log("fitted skill-dependent baseline")
    logger.record_tabular('SkillBaselineExplainedVariance', ev)
    return samples_data
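# ---------------------------------------------------------------------------
# Illustrative sketch (added; not part of the original source): the
# skill-conditioned observation construction above, on toy shapes. The outer
# product of (obs + time_remaining) with the latent one-hot produces one
# feature column block per skill; all shapes and values are made up.
import numpy as np

T, obs_dim, n_skills = 4, 3, 2
observations = np.random.randn(T, obs_dim)
time_remaining = np.linspace(1.0, 0.0, T).reshape(T, 1)
latents = np.eye(n_skills)[np.zeros(T, dtype=int)]  # every step uses skill 0

extended_obs = np.concatenate([observations, time_remaining], axis=1)  # (T, obs_dim+1)
outer = np.matmul(extended_obs[:, :, np.newaxis],
                  latents[:, np.newaxis, :]).reshape(T, -1)  # (T, (obs_dim+1)*n_skills)
new_obs = np.concatenate([outer, extended_obs, latents], axis=1)
print(new_obs.shape)  # (4, (3+1)*2 + (3+1) + 2) = (4, 14)
# ---------------------------------------------------------------------------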
def process_single_batch(self, paths):
    if hasattr(self.algo.baseline, "predict_n"):
        all_path_baselines = self.algo.baseline.predict_n(paths)
    else:
        all_path_baselines = [self.algo.baseline.predict(path)
                              for path in paths]
    if self.use_safety_baselines:
        all_path_safety_baselines = \
            [self.algo.safety_constraint.baseline.predict(path)
             for path in paths]
    for idx, path in enumerate(paths):
        if "weights" not in path:
            path["weights"] = np.ones_like(path["rewards"])
        path_baselines = np.append(all_path_baselines[idx], 0)
        deltas = path["rewards"] + \
                 self.algo.discount * path_baselines[1:] - \
                 path_baselines[:-1]
        # exploration bonuses
        if self.algo.exploration_bonus:
            path["bonuses"] *= self.algo.exploration_lambda
            if self.algo.normalize_bonus:
                path["bonuses"] /= max(1, np.abs(self.bonus_mean))
            if self.algo.nonnegative_bonus_mean:
                path["bonuses"] -= self.bonus_baseline
            deltas += path["bonuses"]
        path["advantages"] = special.discount_cumsum(
            deltas, self.algo.discount * self.algo.gae_lambda)
        path["returns"] = special.discount_cumsum(path["rewards"],
                                                  self.algo.discount)
        # safety constraint values
        if self.algo.safety_constraint:
            path["safety_returns"] = \
                special.discount_cumsum(path["safety_rewards"],
                                        self.algo.safety_discount)
            if self.use_safety_bonus:
                path["safety_robust_rewards"] = \
                    path["safety_rewards"] + path["safety_bonuses"]
                path["safety_robust_returns"] = \
                    special.discount_cumsum(path["safety_robust_rewards"],
                                            self.algo.safety_discount)
            if self.use_safety_baselines:
                path_safety_baselines = \
                    np.append(all_path_safety_baselines[idx], 0)
                safety_deltas = path["safety_rewards"] + \
                    self.algo.safety_discount * path_safety_baselines[1:] - \
                    path_safety_baselines[:-1]
                path["safety_advantages"] = special.discount_cumsum(
                    safety_deltas,
                    self.algo.safety_discount * self.algo.safety_gae_lambda)
            if self.use_safety_bonus and self.use_safety_baselines:
                safety_robust_deltas = path["safety_robust_rewards"] + \
                    self.algo.safety_discount * path_safety_baselines[1:] - \
                    path_safety_baselines[:-1]
                path["safety_robust_advantages"] = special.discount_cumsum(
                    safety_robust_deltas,
                    self.algo.safety_discount * self.algo.safety_gae_lambda)
            if self.algo.safety_tradeoff:
                if not self.use_safety_bonus:
                    safety_reward_key = 'safety_rewards'
                else:
                    safety_reward_key = 'safety_robust_rewards'
                tradeoff_rewards = path["rewards"] - \
                    self.algo.safety_tradeoff_coeff * path[safety_reward_key]
                path["tradeoff_rewards"] = tradeoff_rewards
                path["tradeoff_returns"] = special.discount_cumsum(
                    tradeoff_rewards, self.algo.discount)
                if self.algo.pdo_vf_mode == 1:
                    tradeoff_deltas = deltas - \
                        self.algo.safety_tradeoff_coeff * \
                        path[safety_reward_key]
                    path["advantages"] = special.discount_cumsum(
                        tradeoff_deltas,
                        self.algo.discount * self.algo.gae_lambda)
                else:
                    # this branch uses safety_deltas / safety_robust_deltas,
                    # which are only bound when use_safety_baselines is on
                    if not self.use_safety_bonus:
                        tradeoff_deltas = deltas - \
                            self.algo.safety_tradeoff_coeff * safety_deltas
                    else:
                        tradeoff_deltas = deltas - \
                            self.algo.safety_tradeoff_coeff * \
                            safety_robust_deltas
                    path["advantages"] = special.discount_cumsum(
                        tradeoff_deltas,
                        self.algo.discount * self.algo.gae_lambda)
    ev = special.explained_variance_1d(
        np.concatenate(all_path_baselines),
        np.concatenate([path[self.algo.baseline._target_key]
                        for path in paths]))
    return ev
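# ---------------------------------------------------------------------------
# Illustrative sketch (added; not part of the original source): a plain-numpy
# stand-in for the explained_variance_1d diagnostic returned above,
# 1 - Var[y - ypred] / Var[y], with a guard for constant targets. The rllab
# helper may handle the degenerate case differently.
import numpy as np

def explained_variance_1d(ypred, y):
    vary = np.var(y)
    if np.isclose(vary, 0):
        return 0.0
    return 1.0 - np.var(y - ypred) / vary

returns = np.array([1.0, 2.0, 3.0])
baselines = np.array([0.9, 2.2, 2.8])
print(explained_variance_1d(baselines, returns))
# ---------------------------------------------------------------------------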
def process_samples(self, itr, paths):
    if self.normalize_reward:
        # Update reward mean/std Q.
        rewards = []
        for i in range(len(paths)):
            rewards.append(paths[i]['rewards'])
        rewards_flat = np.hstack(rewards)
        self._reward_mean.append(np.mean(rewards_flat))
        self._reward_std.append(np.std(rewards_flat))
        # Normalize rewards.
        reward_mean = np.mean(np.asarray(self._reward_mean))
        reward_std = np.mean(np.asarray(self._reward_std))
        for i in range(len(paths)):
            paths[i]['rewards'] = (paths[i]['rewards'] -
                                   reward_mean) / (reward_std + 1e-8)
    if itr > 0:
        kls = []
        for i in range(len(paths)):
            kls.append(paths[i]['KL'])
        kls_flat = np.hstack(kls)
        logger.record_tabular('Expl_MeanKL', np.mean(kls_flat))
        logger.record_tabular('Expl_StdKL', np.std(kls_flat))
        logger.record_tabular('Expl_MinKL', np.min(kls_flat))
        logger.record_tabular('Expl_MaxKL', np.max(kls_flat))
        # Perform normalization of the intrinsic rewards.
        if self.use_kl_ratio:
            if self.use_kl_ratio_q:
                # Update KL Q.
                self.kl_previous.append(np.median(np.hstack(kls)))
                previous_mean_kl = np.mean(np.asarray(self.kl_previous))
                for i in range(len(kls)):
                    kls[i] = kls[i] / previous_mean_kl
        # Add KL as intrinsic reward to external reward.
        for i in range(len(paths)):
            paths[i]['rewards'] = paths[i]['rewards'] + self.eta * kls[i]
        # Discount eta.
        self.eta *= self.eta_discount
    else:
        logger.record_tabular('Expl_MeanKL', 0.)
        logger.record_tabular('Expl_StdKL', 0.)
        logger.record_tabular('Expl_MinKL', 0.)
        logger.record_tabular('Expl_MaxKL', 0.)
    baselines = []
    returns = []
    for path in paths:
        path_baselines = np.append(self.baseline.predict(path), 0)
        deltas = path["rewards"] + \
                 self.discount * path_baselines[1:] - \
                 path_baselines[:-1]
        path["advantages"] = special.discount_cumsum(
            deltas, self.discount * self.gae_lambda)
        path["returns"] = special.discount_cumsum(path["rewards_orig"],
                                                  self.discount)
        baselines.append(path_baselines[:-1])
        returns.append(path["returns"])
    if not self.policy.recurrent:
        observations = tensor_utils.concat_tensor_list(
            [path["observations"] for path in paths])
        actions = tensor_utils.concat_tensor_list(
            [path["actions"] for path in paths])
        rewards = tensor_utils.concat_tensor_list(
            [path["rewards"] for path in paths])
        advantages = tensor_utils.concat_tensor_list(
            [path["advantages"] for path in paths])
        env_infos = tensor_utils.concat_tensor_dict_list(
            [path["env_infos"] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list(
            [path["agent_infos"] for path in paths])
        if self.center_adv:
            advantages = util.center_advantages(advantages)
        if self.positive_adv:
            advantages = util.shift_advantages_to_positive(advantages)
        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [
            sum(path["rewards_orig"]) for path in paths
        ]
        ent = np.mean(self.policy.distribution.entropy(agent_infos))
        ev = special.explained_variance_1d(np.concatenate(baselines),
                                           np.concatenate(returns))
        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            paths=paths,
        )
    else:
        max_path_length = max([len(path["advantages"]) for path in paths])
        # make all paths the same length (pad extra advantages with 0)
        obs = [path["observations"] for path in paths]
        obs = np.array(
            [tensor_utils.pad_tensor(ob, max_path_length) for ob in obs])
        if self.center_adv:
            raw_adv = np.concatenate(
                [path["advantages"] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [(path["advantages"] - adv_mean) / adv_std
                   for path in paths]
        else:
            adv = [path["advantages"] for path in paths]
        adv = np.array(
            [tensor_utils.pad_tensor(a, max_path_length) for a in adv])
        actions = [path["actions"] for path in paths]
        actions = np.array(
            [tensor_utils.pad_tensor(a, max_path_length) for a in actions])
        rewards = [path["rewards"] for path in paths]
        rewards = np.array(
            [tensor_utils.pad_tensor(r, max_path_length) for r in rewards])
        agent_infos = [path["agent_infos"] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in agent_infos
        ])
        env_infos = [path["env_infos"] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in env_infos
        ])
        valids = [np.ones_like(path["returns"]) for path in paths]
        valids = np.array(
            [tensor_utils.pad_tensor(v, max_path_length) for v in valids])
        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        ent = np.mean(self.policy.distribution.entropy(agent_infos))
        ev = special.explained_variance_1d(np.concatenate(baselines),
                                           np.concatenate(returns))
        samples_data = dict(
            observations=obs,
            actions=actions,
            advantages=adv,
            rewards=rewards,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
        )
    logger.log("fitting baseline...")
    self.baseline.fit(paths)
    logger.log("fitted")
    logger.record_tabular('Iteration', itr)
    logger.record_tabular('AverageDiscountedReturn',
                          average_discounted_return)
    logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
    logger.record_tabular('ExplainedVariance', ev)
    logger.record_tabular('NumTrajs', len(paths))
    logger.record_tabular('Entropy', ent)
    logger.record_tabular('Perplexity', np.exp(ent))
    logger.record_tabular('StdReturn', np.std(undiscounted_returns))
    logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
    logger.record_tabular('MinReturn', np.min(undiscounted_returns))
    return samples_data
def process_samples(self, itr, paths):
    baselines = []
    returns = []
    if hasattr(self.baseline, "predict_n"):
        all_path_baselines = self.baseline.predict_n(paths)
    else:
        all_path_baselines = [self.baseline.predict(path)
                              for path in paths]
    for idx, path in enumerate(paths):
        path_baselines = np.append(all_path_baselines[idx], 0)
        deltas = path["rewards"] + \
                 self.discount * path_baselines[1:] - \
                 path_baselines[:-1]
        path["advantages"] = special.discount_cumsum(
            deltas, self.discount * self.gae_lambda)
        path["returns"] = special.discount_cumsum(path["rewards"],
                                                  self.discount)
        baselines.append(path_baselines[:-1])
        returns.append(path["returns"])
    ev = special.explained_variance_1d(
        np.concatenate(baselines),
        np.concatenate(returns)
    )
    # if not self.algo.policy.recurrent:  (recurrent branch removed here)
    observations = tensor_utils.concat_tensor_list(
        [path["observations"] for path in paths])
    actions = tensor_utils.concat_tensor_list(
        [path["actions"] for path in paths])
    rewards = tensor_utils.concat_tensor_list(
        [path["rewards"] for path in paths])
    returns = tensor_utils.concat_tensor_list(
        [path["returns"] for path in paths])
    advantages = tensor_utils.concat_tensor_list(
        [path["advantages"] for path in paths])
    env_infos = tensor_utils.concat_tensor_dict_list(
        [path["env_infos"] for path in paths])
    agent_infos = tensor_utils.concat_tensor_dict_list(
        [path["agent_infos"] for path in paths])
    if self.center_adv:
        advantages = util.center_advantages(advantages)
    if self.positive_adv:
        advantages = util.shift_advantages_to_positive(advantages)
    average_discounted_return = \
        np.mean([path["returns"][0] for path in paths])
    undiscounted_returns = [sum(path["rewards"]) for path in paths]
    ent = np.mean(self.policy.distribution.entropy(agent_infos))
    samples_data = dict(
        observations=observations,
        actions=actions,
        rewards=rewards,
        returns=returns,
        advantages=advantages,
        env_infos=env_infos,
        agent_infos=agent_infos,
        paths=paths,
    )
    logger.log("fitting baseline...")
    if hasattr(self.baseline, 'fit_with_samples'):
        self.baseline.fit_with_samples(paths, samples_data)
    else:
        self.baseline.fit(paths)
    logger.log("fitted")
    with logger.tabular_prefix('Low_'):
        logger.record_tabular('Iteration', itr)
        logger.record_tabular('AverageDiscountedReturn',
                              average_discounted_return)
        logger.record_tabular('AverageReturn',
                              np.mean(undiscounted_returns))
        logger.record_tabular('ExplainedVariance', ev)
        logger.record_tabular('NumTrajs', len(paths))
        logger.record_tabular('Entropy', ent)
        logger.record_tabular('Perplexity', np.exp(ent))
        logger.record_tabular('StdReturn', np.std(undiscounted_returns))
        logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
        logger.record_tabular('MinReturn', np.min(undiscounted_returns))
    return samples_data
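# ---------------------------------------------------------------------------
# Illustrative sketch (added; not part of the original source): the Entropy /
# Perplexity diagnostics logged by these samplers, computed for a diagonal
# Gaussian policy (a common case here; the log-std values are made up).
import numpy as np

log_stds = np.array([[0.0, -0.5], [0.1, -0.5]])  # (N, action_dim)
# Diagonal Gaussian entropy per step: sum_i (log_std_i + 0.5 * log(2*pi*e))
ent_per_step = np.sum(log_stds + 0.5 * np.log(2 * np.pi * np.e), axis=-1)
ent = np.mean(ent_per_step)
print(ent, np.exp(ent))  # entropy and its exponential, the "perplexity"
# ---------------------------------------------------------------------------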
def process_samples(self, itr, paths):
    baselines = []
    returns = []
    paths = list(itertools.chain.from_iterable(paths))
    all_path_baselines = [self.algo.baseline.predict(path)
                          for path in paths]
    for index, path in enumerate(paths):
        path_baselines = np.append(all_path_baselines[index], 0)
        total_return = self.get_total_discounted_returns(path["rewards"])
        path["advantages"] = total_return - all_path_baselines[index]
        path["returns"] = total_return
        baselines.append(path_baselines[:-1])
        returns.append(path["returns"])
    ev = special.explained_variance_1d(
        np.concatenate(baselines),
        np.concatenate(returns)
    )
    observations = tensor_utils.concat_tensor_list(
        [path["observations"] for path in paths])
    actions = tensor_utils.concat_tensor_list(
        [path["actions"] for path in paths])
    rewards = tensor_utils.concat_tensor_list(
        [path["rewards"] for path in paths])
    returns = tensor_utils.concat_tensor_list(
        [path["returns"] for path in paths])
    advantages = tensor_utils.concat_tensor_list(
        [path["advantages"] for path in paths])
    env_info = tensor_utils.concat_tensor_dict_list(
        [path["env_infos"] for path in paths])
    agent_info = tensor_utils.concat_tensor_dict_list(
        [path["agent_infos"] for path in paths])
    if self.algo.center_adv:
        advantages = util.center_advantages(advantages)
    if self.algo.positive_adv:
        advantages = util.shift_advantages_to_positive(advantages)
    average_discounted_return = \
        np.mean([path["returns"][0] for path in paths])
    un_discounted_returns = [sum(path["rewards"]) for path in paths]
    ent = np.mean(self.algo.policy.distribution.entropy(agent_info))
    samples_data = dict(
        observations=observations,
        actions=actions,
        rewards=rewards,
        returns=returns,
        advantages=advantages,
        env_info=env_info,
        agent_info=agent_info,
        paths=paths,
    )
    logger.log("fitting baseline...")
    self.algo.baseline.fit(paths)
    logger.log("fitted")
    logger.record_tabular('Iteration', itr)
    # the number of trajectories (observations.shape[0] would be the number
    # of timesteps, not trajectories)
    logger.record_tabular('NofTrajectories', len(paths))
    logger.record_tabular('AverageDiscountedReturn',
                          average_discounted_return)
    logger.record_tabular('AverageReturn', np.mean(un_discounted_returns))
    logger.record_tabular('TotalReturn', np.sum(un_discounted_returns))
    logger.record_tabular('MaxReturn', np.max(un_discounted_returns))
    logger.record_tabular('MinReturn', np.min(un_discounted_returns))
    logger.record_tabular('ExplainedVariance', ev)
    logger.record_tabular('Entropy', ent)
    logger.record_tabular('Perplexity', np.exp(ent))
    logger.record_tabular('StdReturn', np.std(un_discounted_returns))
    return samples_data
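# ---------------------------------------------------------------------------
# Illustrative sketch (added; not part of the original source): a plausible
# get_total_discounted_returns for the variant above, which uses a single
# whole-trajectory return (REINFORCE-style) instead of per-step reward-to-go.
# Broadcasting the scalar to the path length is an assumption, needed so the
# downstream np.concatenate and path["returns"][0] indexing still work.
import numpy as np

def get_total_discounted_returns(rewards, discount=0.99):
    total = np.sum(rewards * discount ** np.arange(len(rewards)))
    return np.full(len(rewards), total)

print(get_total_discounted_returns(np.array([1.0, 1.0, 1.0]), discount=0.5))
# -> [1.75 1.75 1.75]
# ---------------------------------------------------------------------------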
def process_samples(self, itr, paths):
    baselines = []
    returns = []
    for path in paths:
        path_baselines = np.append(self.algo.baseline.predict(path), 0)
        deltas = path["rewards"] + \
                 self.algo.discount * path_baselines[1:] - \
                 path_baselines[:-1]
        path["advantages"] = special.discount_cumsum(
            deltas, self.algo.discount * self.algo.gae_lambda)
        path["returns"] = special.discount_cumsum(path["rewards"],
                                                  self.algo.discount)
        baselines.append(path_baselines[:-1])
        returns.append(path["returns"])
    if not self.algo.policy.recurrent:
        observations = tensor_utils.concat_tensor_list(
            [path["observations"] for path in paths])
        actions = tensor_utils.concat_tensor_list(
            [path["actions"] for path in paths])
        rewards = tensor_utils.concat_tensor_list(
            [path["rewards"] for path in paths])
        advantages = tensor_utils.concat_tensor_list(
            [path["advantages"] for path in paths])
        env_infos = tensor_utils.concat_tensor_dict_list(
            [path["env_infos"] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list(
            [path["agent_infos"] for path in paths])
        if self.algo.center_adv:
            advantages = util.center_advantages(advantages)
        if self.algo.positive_adv:
            advantages = util.shift_advantages_to_positive(advantages)
        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))
        ev = special.explained_variance_1d(np.concatenate(baselines),
                                           np.concatenate(returns))
        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            paths=paths,
        )
    else:
        max_path_length = max([len(path["advantages"]) for path in paths])
        # make all paths the same length (pad extra advantages with 0)
        obs = [path["observations"] for path in paths]
        obs = np.array(
            [tensor_utils.pad_tensor(ob, max_path_length) for ob in obs])
        if self.algo.center_adv:
            raw_adv = np.concatenate(
                [path["advantages"] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [(path["advantages"] - adv_mean) / adv_std
                   for path in paths]
        else:
            adv = [path["advantages"] for path in paths]
        adv = np.array(
            [tensor_utils.pad_tensor(a, max_path_length) for a in adv])
        actions = [path["actions"] for path in paths]
        actions = np.array(
            [tensor_utils.pad_tensor(a, max_path_length) for a in actions])
        rewards = [path["rewards"] for path in paths]
        rewards = np.array(
            [tensor_utils.pad_tensor(r, max_path_length) for r in rewards])
        agent_infos = [path["agent_infos"] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in agent_infos
        ])
        env_infos = [path["env_infos"] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in env_infos
        ])
        valids = [np.ones_like(path["returns"]) for path in paths]
        valids = np.array(
            [tensor_utils.pad_tensor(v, max_path_length) for v in valids])
        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [np.sum(path["rewards"]) for path in paths]
        ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))
        ev = special.explained_variance_1d(np.concatenate(baselines),
                                           np.concatenate(returns))
        samples_data = dict(
            observations=obs,
            actions=actions,
            advantages=adv,
            rewards=rewards,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
        )
    logger.log("fitting baseline...")
    self.algo.baseline.fit(paths)
    logger.log("fitted")
    logger.record_tabular('Iteration', itr)
    logger.record_tabular('AverageDiscountedReturn',
                          average_discounted_return)
    logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
    logger.record_tabular('ExplainedVariance', ev)
    logger.record_tabular('NumTrajs', len(paths))
    logger.record_tabular('Entropy', ent)
    logger.record_tabular('Perplexity', np.exp(ent))
    logger.record_tabular('StdReturn', np.std(undiscounted_returns))
    logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
    logger.record_tabular('MinReturn', np.min(undiscounted_returns))
    return samples_data
def process_samples(self, itr, paths):
    baselines = []
    returns = []
    # custom additions: per-path violation costs and success statistics
    violation_cost = []
    boundary_violation_cost = []
    succ_rate = 0
    succ_return = []
    if hasattr(self.algo.baseline, "predict_n"):
        all_path_baselines = self.algo.baseline.predict_n(paths)
    else:
        all_path_baselines = [
            self.algo.baseline.predict(path) for path in paths
        ]
    for idx, path in enumerate(paths):
        path_baselines = np.append(all_path_baselines[idx], 0)
        deltas = path["rewards"] + \
                 self.algo.discount * path_baselines[1:] - \
                 path_baselines[:-1]
        path["advantages"] = special.discount_cumsum(
            deltas, self.algo.discount * self.algo.gae_lambda)
        path["returns"] = special.discount_cumsum(path["rewards"],
                                                  self.algo.discount)
        baselines.append(path_baselines[:-1])
        returns.append(path["returns"])
        # accumulate the custom per-path statistics
        violation_cost.append(path["violation_cost"])
        boundary_violation_cost.append(path["boundary_violation_cost"])
        succ_rate += path["succ_rate"]
        if path["succ_return"] != 0:
            succ_return.append(path["succ_return"])
    succ_rate = succ_rate / len(paths)
    ev = special.explained_variance_1d(np.concatenate(baselines),
                                       np.concatenate(returns))
    if not self.algo.policy.recurrent:
        observations = tensor_utils.concat_tensor_list(
            [path["observations"] for path in paths])
        actions = tensor_utils.concat_tensor_list(
            [path["actions"] for path in paths])
        rewards = tensor_utils.concat_tensor_list(
            [path["rewards"] for path in paths])
        returns = tensor_utils.concat_tensor_list(
            [path["returns"] for path in paths])
        advantages = tensor_utils.concat_tensor_list(
            [path["advantages"] for path in paths])
        env_infos = tensor_utils.concat_tensor_dict_list(
            [path["env_infos"] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list(
            [path["agent_infos"] for path in paths])
        if self.algo.center_adv:
            advantages = util.center_advantages(advantages)
        if self.algo.positive_adv:
            advantages = util.shift_advantages_to_positive(advantages)
        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        ent = np.mean(self.algo.policy.distribution.entropy(agent_infos))
        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            returns=returns,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            paths=paths,
            violation_cost=np.array(violation_cost),
            boundary_violation_cost=np.array(boundary_violation_cost),
            success_rate=succ_rate,
            successful_AverageReturn=np.array(succ_return),
        )
    else:
        max_path_length = max([len(path["advantages"]) for path in paths])
        # make all paths the same length (pad extra advantages with 0)
        obs = [path["observations"] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)
        if self.algo.center_adv:
            raw_adv = np.concatenate(
                [path["advantages"] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [(path["advantages"] - adv_mean) / adv_std
                   for path in paths]
        else:
            adv = [path["advantages"] for path in paths]
        adv = np.asarray(
            [tensor_utils.pad_tensor(a, max_path_length) for a in adv])
        actions = [path["actions"] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)
        rewards = [path["rewards"] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)
        returns = [path["returns"] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)
        agent_infos = [path["agent_infos"] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in agent_infos
        ])
        env_infos = [path["env_infos"] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in env_infos
        ])
        valids = [np.ones_like(path["returns"]) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)
        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        ent = np.sum(
            self.algo.policy.distribution.entropy(agent_infos) *
            valids) / np.sum(valids)
        samples_data = dict(
            observations=obs,
            actions=actions,
            advantages=adv,
            rewards=rewards,
            returns=returns,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
            violation_cost=np.array(violation_cost),
            boundary_violation_cost=np.array(boundary_violation_cost),
            success_rate=succ_rate,
            successful_AverageReturn=np.array(succ_return),
        )
    logger.log("fitting baseline...")
    if hasattr(self.algo.baseline, 'fit_with_samples'):
        self.algo.baseline.fit_with_samples(paths, samples_data)
    else:
        self.algo.baseline.fit(paths)
    logger.log("fitted")
    logger.record_tabular('Iteration', itr)
    logger.record_tabular('AverageDiscountedReturn',
                          average_discounted_return)
    logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
    logger.record_tabular('ExplainedVariance', ev)
    logger.record_tabular('NumTrajs', len(paths))
    logger.record_tabular('Entropy', ent)
    logger.record_tabular('Perplexity', np.exp(ent))
    logger.record_tabular('StdReturn', np.std(undiscounted_returns))
    logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
    logger.record_tabular('MinReturn', np.min(undiscounted_returns))
    analysis_data = dict(
        Iteration=itr,
        AverageDiscountedReturn=average_discounted_return,
        AverageReturn=np.mean(undiscounted_returns),
        ExplainedVariance=ev,
        NumTrajs=len(paths),
        Entropy=ent,
        Perplexity=np.exp(ent),
        StdReturn=np.std(undiscounted_returns),
        MaxReturn=np.max(undiscounted_returns),
        MinReturn=np.min(undiscounted_returns))
    return samples_data, analysis_data
def process_samples(self, itr, paths):
    baselines = []
    returns = []
    if hasattr(self.algo.baseline, 'predict_n'):
        all_path_baselines = self.algo.baseline.predict_n(paths)
    else:
        all_path_baselines = [
            self.algo.baseline.predict(path) for path in paths
        ]
    for idx, path in enumerate(paths):
        path_baselines = np.append(all_path_baselines[idx], 0)
        deltas = path['rewards'] + \
                 self.algo.discount * path_baselines[1:] - \
                 path_baselines[:-1]
        path['advantages'] = special.discount_cumsum(
            deltas, self.algo.discount * self.algo.gae_lambda)
        path['returns'] = special.discount_cumsum(path['rewards'],
                                                  self.algo.discount)
        baselines.append(path_baselines[:-1])
        returns.append(path['returns'])
    ev = special.explained_variance_1d(np.concatenate(baselines),
                                       np.concatenate(returns))
    if not self.algo.policy.recurrent:
        # Concatenate all samples into one tensor
        observations = tensor_utils.concat_tensor_list(
            [path['observations'] for path in paths])
        actions = tensor_utils.concat_tensor_list(
            [path['actions'] for path in paths])
        rewards = tensor_utils.concat_tensor_list(
            [path['rewards'] for path in paths])
        returns = tensor_utils.concat_tensor_list(
            [path['returns'] for path in paths])
        advantages = tensor_utils.concat_tensor_list(
            [path['advantages'] for path in paths])
        env_infos = tensor_utils.concat_tensor_dict_list(
            [path['env_infos'] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list(
            [path['agent_infos'] for path in paths])
        if self.algo.center_adv:
            advantages = util.center_advantages(advantages)
        if self.algo.positive_adv:
            advantages = util.shift_advantages_to_positive(advantages)
        average_discounted_return = \
            np.mean([path['returns'][0] for path in paths])
        undiscounted_returns = [sum(path['rewards']) for path in paths]
        ent = np.mean(
            self.algo.policy.distribution(agent_infos).entropy())
        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            returns=returns,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            paths=paths,
        )
    else:
        max_path_length = max([len(path['advantages']) for path in paths])
        # make all paths the same length (pad extra advantages with 0)
        obs = [path['observations'] for path in paths]
        obs = tensor_utils.pad_tensor_n(obs, max_path_length)
        if self.algo.center_adv:
            raw_adv = np.concatenate(
                [path['advantages'] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [(path['advantages'] - adv_mean) / adv_std
                   for path in paths]
        else:
            adv = [path['advantages'] for path in paths]
        adv = np.asarray(
            [tensor_utils.pad_tensor(a, max_path_length) for a in adv])
        actions = [path['actions'] for path in paths]
        actions = tensor_utils.pad_tensor_n(actions, max_path_length)
        rewards = [path['rewards'] for path in paths]
        rewards = tensor_utils.pad_tensor_n(rewards, max_path_length)
        returns = [path['returns'] for path in paths]
        returns = tensor_utils.pad_tensor_n(returns, max_path_length)
        agent_infos = [path['agent_infos'] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in agent_infos
        ])
        env_infos = [path['env_infos'] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list([
            tensor_utils.pad_tensor_dict(p, max_path_length)
            for p in env_infos
        ])
        valids = [np.ones_like(path['returns']) for path in paths]
        valids = tensor_utils.pad_tensor_n(valids, max_path_length)
        average_discounted_return = \
            np.mean([path['returns'][0] for path in paths])
        undiscounted_returns = [sum(path['rewards']) for path in paths]
        ent = np.sum(
            self.algo.policy.distribution(agent_infos).entropy() *
            valids) / np.sum(valids)
        samples_data = dict(
            observations=obs,
            actions=actions,
            advantages=adv,
            rewards=rewards,
            returns=returns,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
        )
    logger.log('fitting baseline...')
    if hasattr(self.algo.baseline, 'fit_with_samples'):
        self.algo.baseline.fit_with_samples(paths, samples_data)
    else:
        self.algo.baseline.fit(paths)
    logger.log('fitted')
    logger.record_tabular('Iteration', itr)
    logger.record_tabular('AverageDiscountedReturn',
                          average_discounted_return)
    logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
    logger.record_tabular('ExplainedVariance', ev)
    logger.record_tabular('NumTrajs', len(paths))
    logger.record_tabular('Entropy', ent)
    logger.record_tabular('Perplexity', np.exp(ent))
    logger.record_tabular('StdReturn', np.std(undiscounted_returns))
    logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
    logger.record_tabular('MinReturn', np.min(undiscounted_returns))
    return samples_data
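# ---------------------------------------------------------------------------
# Illustrative sketch (added; not part of the original source): the two
# entropy calling conventions that appear across these variants --
# dist.entropy(agent_infos) (rllab-style, dict in) versus
# policy.distribution(agent_infos).entropy() (object-style). Minimal
# stand-in classes, not any particular library's API.
import numpy as np

class DiagGaussian:
    def __init__(self, log_stds):
        self.log_stds = np.asarray(log_stds)
    def entropy(self):
        # per-step entropy of a diagonal Gaussian
        return np.sum(self.log_stds + 0.5 * np.log(2 * np.pi * np.e),
                      axis=-1)

class RllabStyleDist:
    def entropy(self, agent_infos):
        return DiagGaussian(agent_infos["log_std"]).entropy()

infos = {"log_std": np.zeros((3, 2))}
print(np.mean(RllabStyleDist().entropy(infos)))        # dict-style call
print(np.mean(DiagGaussian(infos["log_std"]).entropy()))  # object-style call
# ---------------------------------------------------------------------------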
def process_samples(self, itr, paths):
    if self.normalize_reward:
        # Update reward mean/std Q.
        rewards = []
        for i in range(len(paths)):
            rewards.append(paths[i]['rewards'])
        rewards_flat = np.hstack(rewards)
        self._reward_mean.append(np.mean(rewards_flat))
        self._reward_std.append(np.std(rewards_flat))
        # Normalize rewards.
        reward_mean = np.mean(np.asarray(self._reward_mean))
        reward_std = np.mean(np.asarray(self._reward_std))
        for i in range(len(paths)):
            paths[i]['rewards'] = (
                paths[i]['rewards'] - reward_mean) / (reward_std + 1e-8)
    if itr > 0:
        kls = []
        for i in range(len(paths)):
            kls.append(paths[i]['KL'])
        kls_flat = np.hstack(kls)
        logger.record_tabular('Expl_MeanKL', np.mean(kls_flat))
        logger.record_tabular('Expl_StdKL', np.std(kls_flat))
        logger.record_tabular('Expl_MinKL', np.min(kls_flat))
        logger.record_tabular('Expl_MaxKL', np.max(kls_flat))
        # Perform normalization of the intrinsic rewards.
        if self.use_kl_ratio:
            if self.use_kl_ratio_q:
                # Update KL Q.
                self.kl_previous.append(np.median(np.hstack(kls)))
                previous_mean_kl = np.mean(np.asarray(self.kl_previous))
                for i in range(len(kls)):
                    kls[i] = kls[i] / previous_mean_kl
        # Add KL as intrinsic reward to external reward.
        for i in range(len(paths)):
            paths[i]['rewards'] = paths[i]['rewards'] + self.eta * kls[i]
        # Discount eta.
        self.eta *= self.eta_discount
    else:
        logger.record_tabular('Expl_MeanKL', 0.)
        logger.record_tabular('Expl_StdKL', 0.)
        logger.record_tabular('Expl_MinKL', 0.)
        logger.record_tabular('Expl_MaxKL', 0.)
    baselines = []
    returns = []
    for path in paths:
        path_baselines = np.append(self.baseline.predict(path), 0)
        deltas = path["rewards"] + \
                 self.discount * path_baselines[1:] - \
                 path_baselines[:-1]
        path["advantages"] = special.discount_cumsum(
            deltas, self.discount * self.gae_lambda)
        path["returns"] = special.discount_cumsum(
            path["rewards_orig"], self.discount)
        baselines.append(path_baselines[:-1])
        returns.append(path["returns"])
    if not self.policy.recurrent:
        observations = tensor_utils.concat_tensor_list(
            [path["observations"] for path in paths])
        actions = tensor_utils.concat_tensor_list(
            [path["actions"] for path in paths])
        rewards = tensor_utils.concat_tensor_list(
            [path["rewards"] for path in paths])
        advantages = tensor_utils.concat_tensor_list(
            [path["advantages"] for path in paths])
        env_infos = tensor_utils.concat_tensor_dict_list(
            [path["env_infos"] for path in paths])
        agent_infos = tensor_utils.concat_tensor_dict_list(
            [path["agent_infos"] for path in paths])
        if self.center_adv:
            advantages = util.center_advantages(advantages)
        if self.positive_adv:
            advantages = util.shift_advantages_to_positive(advantages)
        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [
            sum(path["rewards_orig"]) for path in paths]
        ent = np.mean(self.policy.distribution.entropy(agent_infos))
        ev = special.explained_variance_1d(
            np.concatenate(baselines),
            np.concatenate(returns)
        )
        samples_data = dict(
            observations=observations,
            actions=actions,
            rewards=rewards,
            advantages=advantages,
            env_infos=env_infos,
            agent_infos=agent_infos,
            paths=paths,
        )
    else:
        max_path_length = max([len(path["advantages"]) for path in paths])
        # make all paths the same length (pad extra advantages with 0)
        obs = [path["observations"] for path in paths]
        obs = np.array(
            [tensor_utils.pad_tensor(ob, max_path_length) for ob in obs])
        if self.center_adv:
            raw_adv = np.concatenate(
                [path["advantages"] for path in paths])
            adv_mean = np.mean(raw_adv)
            adv_std = np.std(raw_adv) + 1e-8
            adv = [
                (path["advantages"] - adv_mean) / adv_std
                for path in paths]
        else:
            adv = [path["advantages"] for path in paths]
        adv = np.array(
            [tensor_utils.pad_tensor(a, max_path_length) for a in adv])
        actions = [path["actions"] for path in paths]
        actions = np.array(
            [tensor_utils.pad_tensor(a, max_path_length) for a in actions])
        rewards = [path["rewards"] for path in paths]
        rewards = np.array(
            [tensor_utils.pad_tensor(r, max_path_length) for r in rewards])
        agent_infos = [path["agent_infos"] for path in paths]
        agent_infos = tensor_utils.stack_tensor_dict_list(
            [tensor_utils.pad_tensor_dict(p, max_path_length)
             for p in agent_infos]
        )
        env_infos = [path["env_infos"] for path in paths]
        env_infos = tensor_utils.stack_tensor_dict_list(
            [tensor_utils.pad_tensor_dict(p, max_path_length)
             for p in env_infos]
        )
        valids = [np.ones_like(path["returns"]) for path in paths]
        valids = np.array(
            [tensor_utils.pad_tensor(v, max_path_length) for v in valids])
        average_discounted_return = \
            np.mean([path["returns"][0] for path in paths])
        undiscounted_returns = [sum(path["rewards"]) for path in paths]
        ent = np.mean(self.policy.distribution.entropy(agent_infos))
        ev = special.explained_variance_1d(
            np.concatenate(baselines),
            np.concatenate(returns)
        )
        samples_data = dict(
            observations=obs,
            actions=actions,
            advantages=adv,
            rewards=rewards,
            valids=valids,
            agent_infos=agent_infos,
            env_infos=env_infos,
            paths=paths,
        )
    logger.log("fitting baseline...")
    self.baseline.fit(paths)
    logger.log("fitted")
    logger.record_tabular('Iteration', itr)
    logger.record_tabular('AverageDiscountedReturn',
                          average_discounted_return)
    logger.record_tabular('AverageReturn', np.mean(undiscounted_returns))
    logger.record_tabular('ExplainedVariance', ev)
    logger.record_tabular('NumTrajs', len(paths))
    logger.record_tabular('Entropy', ent)
    logger.record_tabular('Perplexity', np.exp(ent))
    logger.record_tabular('StdReturn', np.std(undiscounted_returns))
    logger.record_tabular('MaxReturn', np.max(undiscounted_returns))
    logger.record_tabular('MinReturn', np.min(undiscounted_returns))
    return samples_data
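# ---------------------------------------------------------------------------
# Illustrative sketch (added; not part of the original source): plain-numpy
# stand-ins for the util.center_advantages and
# util.shift_advantages_to_positive helpers used throughout these variants,
# matching their standard definitions.
import numpy as np

def center_advantages(advantages):
    # zero-mean, unit-variance advantages
    return (advantages - np.mean(advantages)) / (np.std(advantages) + 1e-8)

def shift_advantages_to_positive(advantages):
    # shift so the minimum advantage is slightly above zero
    return (advantages - np.min(advantages)) + 1e-8

adv = np.array([-1.0, 0.0, 3.0])
print(center_advantages(adv))
print(shift_advantages_to_positive(adv))
# ---------------------------------------------------------------------------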