def calculate_loss(self, ob, last_ob, acs, feats=None, last_feat=None):
    n_chunks = 8
    n = ob.shape[0]
    chunk_size = n // n_chunks
    assert n % n_chunks == 0
    sli = lambda i: slice(i * chunk_size, (i + 1) * chunk_size)
    print("calculating dyn loss")
    if self.auxiliary_task.features_shared_with_policy and self.auxiliary_task.policy.lstm1_size:
        return np.concatenate([
            getsess().run(
                self.loss, {
                    self.auxiliary_task.ph_features: feats[sli(i)],
                    self.auxiliary_task.ph_last_features: np.expand_dims(last_feat[sli(i)], axis=1),
                    self.obs: ob[sli(i)],
                    self.last_ob: last_ob[sli(i)],
                    self.ac: acs[sli(i)]
                }) for i in range(n_chunks)
        ], 0)
    else:
        return np.concatenate([
            getsess().run(
                self.loss, {
                    self.obs: ob[sli(i)],
                    self.last_ob: last_ob[sli(i)],
                    self.ac: acs[sli(i)]
                }) for i in range(n_chunks)
        ], 0)

def get_ac_value_nlp_eval(self, ob):
    feed_dict = {self.ph_ob: ((ob,),),
                 self.c_in_1: self.lstm1_c_eval,
                 self.h_in_1: self.lstm1_h_eval}
    if self.lstm2_size:
        feed_dict.update({self.c_in_2: self.lstm2_c_eval,
                          self.h_in_2: self.lstm2_h_eval})
        a, vpred, nlp, self.lstm1_c_eval, self.lstm1_h_eval, self.lstm2_c_eval, self.lstm2_h_eval = \
            getsess().run([self.a_samp, self.vpred, self.nlp_samp,
                           self.c_out_1, self.h_out_1, self.c_out_2, self.h_out_2],
                          feed_dict=feed_dict)
    else:
        a, vpred, nlp, self.lstm1_c_eval, self.lstm1_h_eval = \
            getsess().run([self.a_samp, self.vpred, self.nlp_samp,
                           self.c_out_1, self.h_out_1],
                          feed_dict=feed_dict)
    return a[:, 0], vpred[:, 0], nlp[:, 0]

def get_ac_value_nlp(self, ob):
    # ob: [None, h, w, c]
    # ob[:, None]: [None, 1, h, w, c] (adds a time dimension of length 1)
    a, vpred, nlp = \
        getsess().run([self.a_samp, self.vpred, self.nlp_samp],
                      feed_dict={self.ph_ob: ob[:, None]})
    return a[:, 0], vpred[:, 0], nlp[:, 0]

def get_ac_value_nlp(self, ob):
    # ob.shape = (128, 84, 84, 4); a time dimension is added before feeding, so that
    # ob[:, None].shape = (128, 1, 84, 84, 4)
    a, vpred, nlp = \
        getsess().run([self.a_samp, self.vpred, self.nlp_samp],
                      feed_dict={self.ph_ob: ob[:, None]})
    # outputs: a.shape = vpred.shape = nlp.shape = (128, 1)
    return a[:, 0], vpred[:, 0], nlp[:, 0]

def get_ac_value_nlp(self, ob):
    feed_dict = {self.ph_ob: ob[:, None],
                 self.c_in_1: self.lstm1_c,
                 self.h_in_1: self.lstm1_h}
    if self.lstm2_size > 0:
        feed_dict.update({self.c_in_2: self.lstm2_c,
                          self.h_in_2: self.lstm2_h})
        a, vpred, nlp, self.lstm1_c, self.lstm1_h, self.lstm2_c, self.lstm2_h = \
            getsess().run([self.a_samp, self.vpred, self.nlp_samp,
                           self.c_out_1, self.h_out_1, self.c_out_2, self.h_out_2],
                          feed_dict=feed_dict)
    else:
        a, vpred, nlp, self.lstm1_c, self.lstm1_h = \
            getsess().run([self.a_samp, self.vpred, self.nlp_samp,
                           self.c_out_1, self.h_out_1],
                          feed_dict=feed_dict)
    # print("LSTM1 c: {}".format(self.lstm1_c))
    # print("LSTM1 h: {}".format(self.lstm1_h))
    # print("LSTM2 c: {}".format(self.lstm2_c))
    # print("LSTM2 h: {}".format(self.lstm2_h))
    return a[:, 0], vpred[:, 0], nlp[:, 0]

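# ----------------------------------------------------------------------------
# Illustrative sketch (not from the repo; all names below are hypothetical): the
# state-carrying pattern used by get_ac_value_nlp above, shown on a tiny
# stand-alone TF 1.x graph. The LSTM cell state (c, h) is exposed through
# placeholders, fetched after every run, and fed back in on the next call.
# ----------------------------------------------------------------------------
import numpy as np
import tensorflow as tf

n_hidden = 16
x_ph = tf.placeholder(tf.float32, [1, 8])
c_ph = tf.placeholder(tf.float32, [1, n_hidden])
h_ph = tf.placeholder(tf.float32, [1, n_hidden])
cell = tf.nn.rnn_cell.LSTMCell(n_hidden)
out, state_out = cell(x_ph, tf.nn.rnn_cell.LSTMStateTuple(c_ph, h_ph))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    c = np.zeros((1, n_hidden), np.float32)
    h = np.zeros((1, n_hidden), np.float32)
    for _ in range(3):  # carry (c, h) across successive calls, as the policy does
        _, (c, h) = sess.run([out, state_out],
                             feed_dict={x_ph: np.random.randn(1, 8), c_ph: c, h_ph: h})
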
def get_ac_value_nlp(self, ob):
    a, vpred, nlp, logstd, std, mean = \
        getsess().run([self.a_samp, self.vpred, self.nlp_samp,
                       self.pd_logstd, self.pd_std, self.pd_mean],
                      feed_dict={self.ph_ob: ob[:, None]})
    # print('---LOGSTD--', logstd[:, 0])
    # print('---STD--', std[:, 0])
    # print('---MEAN--', mean[:, 0])
    return a[:, 0], vpred[:, 0], nlp[:, 0]

def save_model(self, model_name, path_dir=None):
    self.saver = tf.train.Saver()
    if path_dir is None:
        path = "/tmp/" + model_name + ".ckpt"
    else:
        path = path_dir + model_name + ".ckpt"
    self.saver.save(getsess(), path)
    print("Model saved to path", path)

def restore_model(self, model_name):
    saver = tf.train.import_meta_graph("models/" + model_name + ".ckpt" + ".meta")
    saver.restore(getsess(), "models/" + model_name + ".ckpt")
    self.vpred = tf.get_collection("vpred")[0]
    self.a_samp = tf.get_collection("a_samp")[0]
    self.entropy = tf.get_collection("entropy")[0]
    self.nlp_samp = tf.get_collection("nlp_samp")[0]
    self.ph_ob = tf.get_collection("ph_ob")[0]

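# ----------------------------------------------------------------------------
# Illustrative sketch (assumes TF 1.x; the tensors here are toy stand-ins, not
# the repo's): restore_model above can only pull tensors out with
# tf.get_collection because they were registered in named collections when the
# graph was originally built and saved. That registration looks like this:
# ----------------------------------------------------------------------------
import tensorflow as tf

ph_ob = tf.placeholder(tf.float32, [None, 84, 84, 4], name="ph_ob")
vpred = tf.reduce_mean(ph_ob, axis=[1, 2, 3], name="vpred")
for name, tensor in [("ph_ob", ph_ob), ("vpred", vpred)]:
    tf.add_to_collection(name, tensor)
# Collections are stored in the .meta file, so after tf.train.import_meta_graph
# and saver.restore, tf.get_collection("vpred")[0] recovers the tensor.
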
def restore_model(self, model_name, path_dir=None):
    if path_dir is None:
        path = "/tmp/" + model_name + ".ckpt"
    else:
        path = path_dir + model_name  # + ".ckpt"
    # self.saver = tf.train.import_meta_graph(path + ".meta")
    self.saver = tf.train.Saver()
    self.saver.restore(getsess(), path)

def calculate_loss(self, ob, last_ob, acs):
    n_chunks = 8
    n = ob.shape[0]
    chunk_size = n // n_chunks
    assert n % n_chunks == 0
    sli = lambda i: slice(i * chunk_size, (i + 1) * chunk_size)
    return np.concatenate([
        getsess().run(self.loss, {self.obs: ob[sli(i)],
                                  self.last_ob: last_ob[sli(i)],
                                  self.ac: acs[sli(i)]})
        for i in range(n_chunks)
    ], 0)

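# ----------------------------------------------------------------------------
# Illustrative sketch (pure NumPy, not part of the repo): the chunking pattern
# used by calculate_loss above -- split the batch along axis 0, evaluate each
# chunk separately (here a toy per-sample "loss" stands in for the sess.run),
# and concatenate the results. It assumes the batch size is divisible by
# n_chunks, as the assert above requires.
# ----------------------------------------------------------------------------
import numpy as np

def chunked_run(fn, arr, n_chunks=8):
    n = arr.shape[0]
    assert n % n_chunks == 0
    chunk_size = n // n_chunks
    sli = lambda i: slice(i * chunk_size, (i + 1) * chunk_size)
    return np.concatenate([fn(arr[sli(i)]) for i in range(n_chunks)], 0)

losses = chunked_run(lambda a: (a ** 2).mean(axis=1), np.random.randn(128, 512))
assert losses.shape == (128,)
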
def save_model(self, model_name, ep_num):
    self.saver = tf.train.Saver()
    if not os.path.exists("models"):
        os.makedirs("models")
    if ep_num:
        path = "models/" + model_name + "_ep{}".format(ep_num) + ".ckpt"
    else:
        path = "models/" + model_name + "_{}".format("final") + ".ckpt"
    self.saver.save(getsess(), path)
    print("Model saved to path", path)

def calculate_loss(self, ob, last_ob, acs, feat_input, pat):
    n_chunks = 8
    ans_buf = []
    ac_buf = []
    feat_buf = []
    n = ob.shape[0]
    chunk_size = n // n_chunks
    assert n % n_chunks == 0
    sli = lambda i: slice(i * chunk_size, (i + 1) * chunk_size)
    # important: last_ob holds s_{t+1}, obs holds s_t, ac holds the corresponding actions
    for i in range(n_chunks):
        if pat:
            (ans, feat) = getsess().run(
                self.loss, {
                    self.obs: ob[sli(i)],
                    self.last_ob: last_ob[sli(i)],
                    self.ac: acs[sli(i)],
                    self.features: feat_input[sli(i)]
                })
        else:
            (ans, feat) = getsess().run(
                self.loss, {
                    self.obs: ob[sli(i)],
                    self.last_ob: last_ob[sli(i)],
                    self.ac: acs[sli(i)]
                })
        ans_buf.append(ans)
        ac_buf.append(acs[sli(i)])
        feat_buf.append(feat)
    """
    if pat:
        return tf.stop_gradient(np.concatenate(ans_buf, 0)), tf.stop_gradient(np.concatenate(ac_buf, 0)), tf.stop_gradient(np.concatenate(feat_buf, 0))
    else:
    """
    return np.concatenate(ans_buf, 0), np.concatenate(ac_buf, 0), np.concatenate(feat_buf, 0)

def get_ac_value_nlp_extra_input(self, ob, vel, prev_ac, prev_rew, feats=False):
    feed_ac = np.expand_dims(np.array(prev_ac), axis=1)
    # feed_ac = prev_ac
    feed_vel = np.expand_dims(np.array(vel), axis=1)
    feed_rew = np.expand_dims(np.array(prev_rew), axis=1)
    # feed_rew = prev_rew
    # print("Ac, vel, rew shapes:", feed_ac.shape, feed_vel.shape, feed_rew.shape)
    feed_dict = {self.ph_ob: ob[:, None],
                 self.ph_vel: feed_vel,
                 self.ph_prev_ac: feed_ac,
                 self.ph_prev_rew: feed_rew,
                 self.c_in_1: self.lstm1_c,
                 self.h_in_1: self.lstm1_h}
    if self.lstm2_size > 0:
        feed_dict.update({self.c_in_2: self.lstm2_c, self.h_in_2: self.lstm2_h})
        out_feats, a, vpred, nlp, self.lstm1_c, self.lstm1_h, self.lstm2_c, self.lstm2_h = \
            getsess().run([self.features, self.a_samp, self.vpred, self.nlp_samp,
                           self.c_out_1, self.h_out_1, self.c_out_2, self.h_out_2],
                          feed_dict=feed_dict)
        if feats:
            self.last_c_1 = self.lstm1_c
            self.last_h_1 = self.lstm1_h
            self.last_c_2 = self.lstm2_c
            self.last_h_2 = self.lstm2_h
    else:
        out_feats, a, vpred, nlp, self.lstm1_c, self.lstm1_h = \
            getsess().run([self.features, self.a_samp, self.vpred, self.nlp_samp,
                           self.c_out_1, self.h_out_1],
                          feed_dict=feed_dict)
    # print("LSTM1 c: {}".format(self.lstm1_c))
    # print("LSTM1 h: {}".format(self.lstm1_h))
    # print("LSTM2 c: {}".format(self.lstm2_c))
    # print("LSTM2 h: {}".format(self.lstm2_h))
    if feats:
        return a[:, 0], vpred[:, 0], nlp[:, 0], out_feats[:, 0, :]
    return a[:, 0], vpred[:, 0], nlp[:, 0]

def calculate_loss(self, ob, last_ob, acs):
    n_chunks = 8
    n = ob.shape[0]
    chunk_size = n // n_chunks
    assert n % n_chunks == 0
    sli = lambda i: slice(i * chunk_size, (i + 1) * chunk_size)
    result = [
        getsess().run(
            [self.loss1, self.first_pred, self.first_pred_flat], {
                self.obs: ob[sli(i)],
                self.last_ob: last_ob[sli(i)],
                self.ac: acs[sli(i)]
            }) for i in range(n_chunks)
    ]
    self.buff_preds = [result[i][2] for i in range(n_chunks)]
    loss_total = [result[i][0] for i in range(n_chunks)]
    discount = self.pred_discount
    for p in range(self.num_preds - 1):
        result = [
            getsess().run(
                [self.loss2, self.next_pred, self.next_pred_flat], {
                    self.obs: ob[sli(i)],
                    self.last_ob: last_ob[sli(i)],
                    self.features: result[i - 1 - p][1],
                    self.extracted_features: result[i - 1 - p][2]
                }) for i in range(1, n_chunks)
        ]
        loss2 = [result[i][0] for i in range(n_chunks - 1 - p)]
        avg_loss2 = np.sum(loss2, axis=0) / len(loss2)
        for q in range(p + 1):
            loss2.append(avg_loss2)
        loss_total = [
            loss_total[i] + (discount * loss2[i]) for i in range(n_chunks)
        ]
        discount = discount * self.pred_discount
    return np.concatenate(loss_total, 0)

def train(self):
    self.agent.start_interaction(self.envs, nlump=self.hps['nlumps'],
                                 intrinsic_model=self.intrinsic_model)
    sess = getsess()
    while True:
        info = self.agent.step()
        if info['update']:
            logger.logkvs(info['update'])
            logger.dumpkvs()
        if self.agent.rollout.stats['tcount'] > self.num_timesteps:
            break
    self.agent.stop_interaction()

def calculate_loss(self, ob, last_ob, acs):
    n_chunks = 8
    ans_buf = []
    ps_buf = []
    ac_buf = []
    n = ob.shape[0]
    chunk_size = n // n_chunks
    assert n % n_chunks == 0
    sli = lambda i: slice(i * chunk_size, (i + 1) * chunk_size)
    # important: last_ob holds s_{t+1}, obs holds s_t, ac holds the corresponding actions
    for i in range(n_chunks):
        (ans, ps) = getsess().run(self.loss, {self.obs: ob[sli(i)],
                                              self.last_ob: last_ob[sli(i)],
                                              self.ac: acs[sli(i)]})
        ans_buf.append(ans)
        ps_buf.append(ps)
        ac_buf.append(acs[sli(i)])
    return np.concatenate(ans_buf, 0), np.concatenate(ps_buf, 0), np.concatenate(ac_buf, 0)

def calculate_reward(self, ob, last_ob, acs):
    """
    Called from rollout (l-64 and l-76) to compute the intrinsic reward from the
    states and actions actually encountered during interaction.
    self.loss in __init__ defines the computation graph; here the real ob, last_ob
    and acs are passed as the feed_dict, last_obs is predicted from obs and act,
    and the loss is computed. The computation is split into several chunks,
    presumably because GPU memory is limited and the whole batch cannot be fed at once.
    Inputs: ob.shape=(128,128,84,84,4), last_ob.shape=(128,1,84,84,4), acs.shape=(128,128)
    Output: shape=(128,128,512)
    """
    n_chunks = 8
    n = ob.shape[0]
    chunk_size = n // n_chunks
    assert n % n_chunks == 0
    sli = lambda i: slice(i * chunk_size, (i + 1) * chunk_size)
    return np.concatenate([
        getsess().run(
            self.reward, {
                self.obs: ob[sli(i)],
                self.last_ob: last_ob[sli(i)],
                self.ac: acs[sli(i)]
            }) for i in range(n_chunks)
    ], 0)

def log_compute_rewards(self, ob, last_ob, acs, session=None):
    n_chunks = 8
    n = ob.shape[0]
    chunk_size = n // n_chunks
    assert n % n_chunks == 0
    sli = lambda i: slice(i * chunk_size, (i + 1) * chunk_size)
    if session is None:
        session = getsess()

    def run_chunked(tensor):
        # Evaluate one reward tensor chunk by chunk and concatenate along axis 0.
        return np.concatenate([
            session.run(tensor,
                        feed_dict={self.obs: ob[sli(i)],
                                   self.last_ob: last_ob[sli(i)],
                                   self.ac: acs[sli(i)]})
            for i in range(n_chunks)
        ], 0)

    # the individual intrinsic-reward terms
    rew_kl_np = run_chunked(self.reward_kl)
    rew_elbo_np = run_chunked(self.reward_elbo)
    rew_elbo_var_np = run_chunked(self.reward_elbo_var)
    rew_pred_var_np = run_chunked(self.reward_pred_var)
    rew_var_mean_np = run_chunked(self.reward_var_mean)

    # print("Reward shape:", rew_kl_np.shape, rew_elbo_np.shape, rew_elbo_var_np.shape, rew_pred_var_np.shape)
    print("Reward mean: rew_kl: ", np.mean(rew_kl_np),
          ", rew_elbo:", np.mean(rew_elbo_np),
          ", rew_elbo_var:", np.mean(rew_elbo_var_np),
          ", rew_pred_var:", np.mean(rew_pred_var_np),
          ", rew_var_mean:", np.mean(rew_var_mean_np))
    return np.array([
        np.mean(rew_kl_np),
        np.mean(rew_elbo_np),
        np.mean(rew_elbo_var_np),
        np.mean(rew_pred_var_np),
        np.mean(rew_var_mean_np)
    ])

def log_train_loss(self, ob, last_ob, acs, session=None):
    """
    Inputs: ob.shape=(128,128,84,84,4), last_ob.shape=(128,1,84,84,4), acs.shape=(128,128)
    Returns the means of the total training loss and of its individual terms.
    """
    n_chunks = 8
    n = ob.shape[0]
    chunk_size = n // n_chunks
    assert n % n_chunks == 0
    sli = lambda i: slice(i * chunk_size, (i + 1) * chunk_size)
    if session is None:
        session = getsess()

    def run_chunked(tensor):
        # Evaluate one loss tensor chunk by chunk and concatenate along axis 0.
        return np.concatenate([
            session.run(tensor,
                        feed_dict={self.obs: ob[sli(i)],
                                   self.last_ob: last_ob[sli(i)],
                                   self.ac: acs[sli(i)]})
            for i in range(n_chunks)
        ], 0)

    # total loss, shape (128, 128)
    loss_np = run_chunked(self.loss)
    # print("Train Loss: shape =", loss_np.shape, ", mean=", np.mean(loss_np))
    # the individual loss terms
    rec_loss_np = run_chunked(self.rec_loss)
    kl_loss_np = run_chunked(self.kl_loss)
    prior_reg_loss_np = run_chunked(self.prior_reg_loss)
    # print("Train loss shape: rec_loss: ", rec_loss_np.shape, ", kl_loss shape: ", kl_loss_np.shape, ", prior_reg_loss shape: ", prior_reg_loss_np.shape)
    print("DVAE loss:", np.mean(loss_np),
          ", rec: ", np.mean(rec_loss_np),
          ", kl: ", np.mean(kl_loss_np),
          ", prior_reg: ", np.mean(prior_reg_loss_np))
    return np.array([
        np.mean(loss_np),
        np.mean(rec_loss_np),
        np.mean(kl_loss_np),
        np.mean(prior_reg_loss_np)
    ])

def calculate_err(self, ob, last_ob, acs):
    return getsess().run([self.pred_error, self.pred_features], {
        self.obs: ob,
        self.last_ob: last_ob,
        self.ac: acs
    })

def get_ac_value_nlp(self, ob, err, obpred, state=None, mask=None):
    a, vpred, snew, nlp = \
        getsess().run([self.a_samp, self.vpred, self.snew, self.nlp_samp],
                      feed_dict={self.ph_ob: ob[:, None],
                                 self.states_ph: state,
                                 self.masks_ph: mask[:, None],
                                 self.pred_error: err[:, None],
                                 self.obs_pred: obpred[:, None]})
    return a[:, 0], vpred[:, 0], snew, nlp[:, 0]

def inference_get_ac_value_nlp(self, ob):
    action_scores, a, vpred, nlp = \
        getsess().run([self.joe_db1, self.a_samp, self.vpred, self.nlp_samp],
                      # feed_dict={self.ph_ob: ob[:, None]})
                      feed_dict={self.ph_ob: ob})
    return action_scores, a[:, 0], vpred[:, 0], nlp[:, 0]

def get_ac_value_nlp(self, ob):
    # print("obs", ob.shape, self.ph_ob.shape)
    a, vpred, nlp = \
        getsess().run([self.a_samp, self.vpred, self.nlp_samp],
                      feed_dict={self.ph_ob: ob[:, None]})
    return a[:, 0], vpred[:, 0], nlp[:, 0]

def get_ac_value_nlp_2vf(self, ob):
    a, vpred_int, vpred_ext, nlp = \
        getsess().run([self.a_samp, self.vpred_int, self.vpred_ext, self.nlp_samp],
                      feed_dict={self.ph_ob: ob[:, None]})
    return a[:, 0], vpred_int[:, 0], vpred_ext[:, 0], nlp[:, 0]

def calculate_loss(self, ob, last_ob, acs, audio):
    if self.updates % 200 == 1:
        if self.updates == 1:
            os.system('mkdir -p ' + self.log_dir + '/checkpoints/')
        self.saver.save(getsess(), self.log_dir + '/checkpoints/model',
                        global_step=self.updates)
    self.updates += 1
    n_chunks = 8
    n = ob.shape[0]
    chunk_size = n // n_chunks
    if chunk_size == 0:
        n_chunks = 1
        chunk_size = n
    sli = lambda i: slice(i * chunk_size, (i + 1) * chunk_size)
    audio_features = self.get_audio_features(audio)
    if self.make_video:
        print("saving audio features")
        np.save(self.log_dir + '/audio_features', audio_features)
    if self.feature_space == 'joint' or self.feature_space == 'visual':
        losses = [
            getsess().run(
                self.loss, {
                    self.audio_out_features: audio_features[sli(i)],
                    self.obs: ob[sli(i)],
                    self.last_ob: last_ob[sli(i)],
                    self.ac: acs[sli(i)]
                }) for i in range(n_chunks)
        ]
        return np.concatenate(losses, 0), None, None, None
    variables_to_run = [self.loss, self.tf_predictions]
    if self.train_discriminator:
        variables_to_run.append(self.discriminator_predictions)
    tf_outputs = [
        getsess().run(
            variables_to_run, {
                self.audio_out_features: audio_features[sli(i)],
                self.obs: ob[sli(i)],
                self.last_ob: last_ob[sli(i)],
                self.ac: acs[sli(i)]
            }) for i in range(n_chunks)
    ]
    losses = np.concatenate([chunk[0] for chunk in tf_outputs])
    predicted_audio_features = np.concatenate([chunk[1] for chunk in tf_outputs])
    if self.train_discriminator:
        discriminator_outputs = np.concatenate([chunk[2] for chunk in tf_outputs])
    else:
        discriminator_outputs = None
    prediction_audio = []
    target_audio = []
    for step in range(audio_features.shape[1]):
        # Only reconstruct for environment 0
        prediction_audio.extend(
            self.reconstruct_audio(predicted_audio_features[0, step]))
        target_audio.extend(self.reconstruct_audio(audio_features[0, step]))
    prediction_audio = np.asarray(prediction_audio).astype(np.int16)
    target_audio = np.asarray(target_audio).astype(np.int16)
    # First term is the agent's intrinsic reward; the others are used for the debug video
    return losses, prediction_audio, target_audio, discriminator_outputs

def get_ac_value_nlp_eval(self, ob):
    a, vpred, nlp = getsess().run([self.a_samp, self.vpred, self.nlp_samp],
                                  feed_dict={self.ph_ob: ((ob,),)})
    return a[:, 0], vpred[:, 0], nlp[:, 0]

import tensorflow as tf
from utils import getsess

with tf.Session() as sess:
    new_saver = tf.train.import_meta_graph('./tmp/model.ckpt-0.meta')
    new_saver.restore(sess, tf.train.latest_checkpoint('./tmp/'))
    print(getsess())