def start_interaction(self, env_fns, dynamics, nlump=2):
    self.loss_names, self._losses = zip(*list(self.to_report.items()))
    params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
    if MPI.COMM_WORLD.Get_size() > 1:
        trainer = MpiAdamOptimizer(learning_rate=self.ph_lr, comm=MPI.COMM_WORLD)
    else:
        trainer = tf.train.AdamOptimizer(learning_rate=self.ph_lr)
    gradsandvars = trainer.compute_gradients(self.total_loss, params)
    self._train = trainer.apply_gradients(gradsandvars)

    if MPI.COMM_WORLD.Get_rank() == 0:
        getsess().run(tf.variables_initializer(
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)))
    bcast_tf_vars_from_root(
        getsess(), tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES))

    self.all_visited_rooms = []
    self.all_scores = []
    self.nenvs = nenvs = len(env_fns)
    self.nlump = nlump
    self.lump_stride = nenvs // self.nlump
    self.envs = [
        VecEnv(env_fns[l * self.lump_stride:(l + 1) * self.lump_stride],
               spaces=[self.ob_space, self.ac_space])
        for l in range(self.nlump)
    ]

    self.rollout = Rollout(hps=self.hps, ob_space=self.ob_space, ac_space=self.ac_space,
                           nenvs=nenvs, nsteps_per_seg=self.nsteps_per_seg,
                           nsegs_per_env=self.nsegs_per_env, nlumps=self.nlump,
                           envs=self.envs, policy=self.stochpol,
                           int_rew_coeff=self.int_coeff, ext_rew_coeff=self.ext_coeff,
                           record_rollouts=self.use_recorder, dynamics=dynamics)

    self.buf_advs = np.zeros((nenvs, self.rollout.nsteps), np.float32)
    self.buf_advs_int = np.zeros((nenvs, self.rollout.nsteps), np.float32)
    self.buf_advs_ext = np.zeros((nenvs, self.rollout.nsteps), np.float32)
    self.buf_rets = np.zeros((nenvs, self.rollout.nsteps), np.float32)
    self.buf_rets_int = np.zeros((nenvs, self.rollout.nsteps), np.float32)
    self.buf_rets_ext = np.zeros((nenvs, self.rollout.nsteps), np.float32)

    if self.normrew:
        self.rff = RewardForwardFilter(self.gamma)
        self.rff_rms = RunningMeanStd()

    self.step_count = 0
    self.t_last_update = time.time()
    self.t_start = time.time()
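# --- Illustrative sketch (not part of the original code) ---
# Shows how env_fns are partitioned into `nlump` VecEnv groups of `lump_stride`
# environments each, mirroring the slicing above. `make_env` is a hypothetical
# stand-in for a real environment thunk; the counts are example values only.
def make_env(rank):
    return ("env", rank)

env_fns = [lambda i=i: make_env(i) for i in range(128)]
nlump = 2
lump_stride = len(env_fns) // nlump
chunks = [env_fns[l * lump_stride:(l + 1) * lump_stride] for l in range(nlump)]
assert [len(c) for c in chunks] == [64, 64]  # each VecEnv receives one contiguous slice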
def start_interaction(self, env_fns, dynamics, nlump=2):
    # Define the variables and computation graph used when interacting with the
    # environment, and initialize the Rollout class.
    self.loss_names, self._losses = zip(*list(self.to_report.items()))
    # Define the losses, gradients and backprop; during training, sess.run(self._train)
    # performs one update step.
    params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
    params_dvae = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="dvae_reward")
    print("total params:", np.sum([np.prod(v.get_shape().as_list()) for v in params]))      # 6629459
    print("dvae params:", np.sum([np.prod(v.get_shape().as_list()) for v in params_dvae]))  # 2726144

    if MPI.COMM_WORLD.Get_size() > 1:
        trainer = MpiAdamOptimizer(learning_rate=self.ph_lr, comm=MPI.COMM_WORLD)
    else:
        trainer = tf.train.AdamOptimizer(learning_rate=self.ph_lr)
    gradsandvars = trainer.compute_gradients(self.total_loss, params)
    self._train = trainer.apply_gradients(gradsandvars)

    # add bai: compute the gradients for the DVAE separately
    gradsandvars_dvae = trainer.compute_gradients(self.dynamics_loss, params_dvae)
    self._train_dvae = trainer.apply_gradients(gradsandvars_dvae)

    if MPI.COMM_WORLD.Get_rank() == 0:
        getsess().run(tf.variables_initializer(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)))
    bcast_tf_vars_from_root(getsess(), tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES))

    self.all_visited_rooms = []
    self.all_scores = []
    self.nenvs = nenvs = len(env_fns)        # default 128
    self.nlump = nlump                       # default 1
    self.lump_stride = nenvs // self.nlump   # 128/1 = 128
    self.envs = [
        VecEnv(env_fns[l * self.lump_stride:(l + 1) * self.lump_stride],
               spaces=[self.ob_space, self.ac_space])
        for l in range(self.nlump)
    ]

    # The Rollout class is defined in rollouts.py.
    self.rollout = Rollout(ob_space=self.ob_space, ac_space=self.ac_space, nenvs=nenvs,
                           nsteps_per_seg=self.nsteps_per_seg,
                           nsegs_per_env=self.nsegs_per_env, nlumps=self.nlump,
                           envs=self.envs, policy=self.stochpol,
                           int_rew_coeff=self.int_coeff, ext_rew_coeff=self.ext_coeff,
                           record_rollouts=self.use_recorder, dynamics=dynamics)

    # Buffers of shape (number of envs/threads, rollout length T).
    self.buf_advs = np.zeros((nenvs, self.rollout.nsteps), np.float32)
    self.buf_rets = np.zeros((nenvs, self.rollout.nsteps), np.float32)

    if self.normrew:
        self.rff = RewardForwardFilter(self.gamma)
        self.rff_rms = RunningMeanStd()

    self.step_count = 0
    self.t_last_update = time.time()
    self.t_start = time.time()
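# --- Illustrative sketch (not part of the original code) ---
# Mirrors the separate DVAE update above: compute_gradients restricted to the
# variables collected under a scope updates only those variables. The scope and
# variable names here are hypothetical toy stand-ins.
import tensorflow as tf

with tf.variable_scope("dvae_reward"):
    w_dvae = tf.get_variable("w_dvae", initializer=1.0)
w_policy = tf.get_variable("w_policy", initializer=1.0)
loss = tf.square(w_dvae) + tf.square(w_policy)

params_dvae = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="dvae_reward")
opt = tf.train.AdamOptimizer(1e-3)
train_dvae = opt.apply_gradients(opt.compute_gradients(loss, params_dvae))
# Running train_dvae updates only w_dvae; w_policy is left untouched by this op.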
def start_interaction(self, env_fns, dynamics, nlump=2):
    self.loss_names, self._losses = zip(*list(self.to_report.items()))
    self.global_step = tf.Variable(0, trainable=False)
    params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
    if MPI.COMM_WORLD.Get_size() > 1:
        if self.agent_num is None:
            trainer = MpiAdamOptimizer(learning_rate=self.ph_lr, comm=MPI.COMM_WORLD)
    else:
        if self.agent_num is None:
            if self.optim == 'adam':
                trainer = tf.train.AdamOptimizer(learning_rate=self.ph_lr)
            elif self.optim == 'sgd':
                print("using sgd")
                print("________________________")
                if self.decay:
                    self.decay_lr = tf.train.exponential_decay(
                        self.ph_lr, self.global_step, 2500, .96, staircase=True)
                    trainer = tf.train.GradientDescentOptimizer(learning_rate=self.decay_lr)
                else:
                    trainer = tf.train.GradientDescentOptimizer(learning_rate=self.ph_lr)
            elif self.optim == 'momentum':
                print('using momentum')
                print('________________________')
                trainer = tf.train.MomentumOptimizer(learning_rate=self.ph_lr, momentum=0.9)

    if self.agent_num is None:
        gradsandvars = trainer.compute_gradients(self.total_loss, params)

        l2_norm = lambda t: tf.sqrt(tf.reduce_sum(tf.pow(t, 2)))
        if self.log_grads:
            for grad, var in gradsandvars:
                tf.summary.histogram(var.name + '/gradient', l2_norm(grad))
                tf.summary.histogram(var.name + '/value', l2_norm(var))
                grad_mean = tf.reduce_mean(tf.abs(grad))
                tf.summary.scalar(var.name + '/grad_mean', grad_mean)
            if self.decay:
                tf.summary.scalar('decay_lr', self.decay_lr)
            self._summary = tf.summary.merge_all()
            tf.add_to_collection("summary_op", self._summary)

        if self.grad_clip > 0:
            grads, gradvars = zip(*gradsandvars)
            grads, _ = tf.clip_by_global_norm(grads, self.grad_clip)
            gradsandvars = list(zip(grads, gradvars))

        self._train = trainer.apply_gradients(gradsandvars, global_step=self.global_step)
        self._updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        self._train = tf.group(self._train, self._updates)
        tf.add_to_collection("train_op", self._train)
    else:
        self._train = tf.get_collection("train_op")[0]
        if self.log_grads:
            self._summary = tf.get_collection("summary_op")[0]

    if MPI.COMM_WORLD.Get_rank() == 0:
        getsess().run(tf.variables_initializer(
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)))
    bcast_tf_vars_from_root(
        getsess(), tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES))

    self.all_visited_rooms = []
    self.all_scores = []
    self.nenvs = nenvs = len(env_fns)
    self.nlump = nlump
    self.lump_stride = nenvs // self.nlump
    self.envs = [
        VecEnv(env_fns[l * self.lump_stride:(l + 1) * self.lump_stride],
               spaces=[self.env_ob_space, self.ac_space])
        for l in range(self.nlump)
    ]

    self.rollout = Rollout(ob_space=self.ob_space, ac_space=self.ac_space, nenvs=nenvs,
                           nsteps_per_seg=self.nsteps_per_seg,
                           nsegs_per_env=self.nsegs_per_env, nlumps=self.nlump,
                           envs=self.envs, policy=self.stochpol,
                           int_rew_coeff=self.int_coeff, ext_rew_coeff=self.ext_coeff,
                           record_rollouts=self.use_recorder, dynamics=dynamics,
                           exp_name=self.exp_name, env_name=self.env_name,
                           video_log_freq=self.video_log_freq,
                           model_save_freq=self.model_save_freq,
                           use_apples=self.use_apples, multi_envs=self.multi_envs,
                           lstm=self.lstm, lstm1_size=self.lstm1_size,
                           lstm2_size=self.lstm2_size, depth_pred=self.depth_pred,
                           early_stop=self.early_stop, aux_input=self.aux_input)

    self.buf_advs = np.zeros((nenvs, self.rollout.nsteps), np.float32)
    self.buf_rets = np.zeros((nenvs, self.rollout.nsteps), np.float32)

    if self.normrew:
        self.rff = RewardForwardFilter(self.gamma)
        self.rff_rms = RunningMeanStd()

    self.step_count = 0
    self.t_last_update = time.time()
    self.t_start = time.time()
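# --- Illustrative sketch (not part of the original code) ---
# Demonstrates the staircase schedule used in the SGD branch above:
# tf.train.exponential_decay(lr, step, 2500, .96, staircase=True) multiplies the
# base learning rate by 0.96 once every 2500 global steps.
import tensorflow as tf

global_step = tf.Variable(0, trainable=False)
base_lr = tf.placeholder(tf.float32, [])
decay_lr = tf.train.exponential_decay(base_lr, global_step, 2500, .96, staircase=True)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in (0, 2499, 2500, 5000):
        sess.run(tf.assign(global_step, step))
        print(step, sess.run(decay_lr, feed_dict={base_lr: 1e-4}))
        # 0 and 2499 -> 1e-4, 2500 -> 9.6e-5, 5000 -> ~9.216e-5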
def start_interaction(self, env_fns, dynamics, nlump=2):
    self.loss_names, self._losses = zip(*list(self.to_report.items()))
    params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
    if MPI.COMM_WORLD.Get_size() > 1:
        trainer = MpiAdamOptimizer(learning_rate=self.ph_lr, comm=MPI.COMM_WORLD)
    else:
        trainer = tf.train.AdamOptimizer(learning_rate=self.ph_lr)

    # gvs = trainer.compute_gradients(self.total_loss, params)
    # self.gshape = gs
    # gs = [g for (g, v) in gvs]
    # self.normg = tf.linalg.global_norm(gs)
    # new_g = [tf.clip_by_norm(g, 10.0) for g in gs]
    # self.nnormg = tf.linalg.global_norm(new_g)

    def ClipIfNotNone(grad):
        return tf.clip_by_value(grad, -25.0, 25.0) if grad is not None else grad

    gradsandvars = trainer.compute_gradients(self.total_loss, params)
    # gs = [g for (g, v) in gradsandvars]
    # new_g = [tf.clip_by_norm(g, 10.0) for g in gs if g is not None]
    gradsandvars = [(ClipIfNotNone(g), v) for g, v in gradsandvars]
    # new_g = [g for (g, v) in gradsandvars]
    # self.nnormg = tf.linalg.global_norm(new_g)
    # gradsandvars = [(ClipIfNotNone(grad), var) for grad, var in gradsandvars]
    self._train = trainer.apply_gradients(gradsandvars)

    if MPI.COMM_WORLD.Get_rank() == 0:
        getsess().run(tf.variables_initializer(
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)))
    bcast_tf_vars_from_root(
        getsess(), tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES))

    self.all_visited_rooms = []
    self.all_scores = []
    self.nenvs = nenvs = len(env_fns)
    print('-------NENVS-------', self.nenvs)
    self.nlump = nlump
    print('----------NLUMPS-------', self.nlump)
    self.lump_stride = nenvs // self.nlump
    print('-------LSTRIDE----', self.lump_stride)
    print('--------OBS SPACE ---------', self.ob_space)
    print('-------------AC SPACE-----', self.ac_space)
    # assert 1 == 2
    print('-----BEFORE VEC ENV------')
    self.envs = [
        VecEnv(env_fns[l * self.lump_stride:(l + 1) * self.lump_stride],
               spaces=[self.ob_space, self.ac_space])
        for l in range(self.nlump)
    ]
    print('-----AFTER VEC ENV------')

    self.rollout = Rollout(ob_space=self.ob_space, ac_space=self.ac_space, nenvs=nenvs,
                           nsteps_per_seg=self.nsteps_per_seg,
                           nsegs_per_env=self.nsegs_per_env, nlumps=self.nlump,
                           envs=self.envs, policy=self.stochpol,
                           int_rew_coeff=self.int_coeff, ext_rew_coeff=self.ext_coeff,
                           record_rollouts=self.use_recorder, dynamics=dynamics,
                           exp_name=self.exp_name, env_name=self.env_name,
                           to_eval=self.to_eval)

    self.buf_advs = np.zeros((nenvs, self.rollout.nsteps), np.float32)
    self.buf_rets = np.zeros((nenvs, self.rollout.nsteps), np.float32)

    if self.normrew:
        self.rff = RewardForwardFilter(self.gamma)
        self.rff_rms = RunningMeanStd()

    self.step_count = 0
    self.t_last_update = time.time()
    self.t_start = time.time()
    self.saver = tf.train.Saver(max_to_keep=5)
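# --- Illustrative sketch (not part of the original code) ---
# Contrasts the element-wise clipping used by ClipIfNotNone above (each gradient
# entry bounded to [-25, 25]) with global-norm clipping, which rescales all
# gradients jointly. The toy variable and loss below are hypothetical.
import tensorflow as tf

w = tf.Variable([30.0, 40.0])
loss = 100.0 * tf.reduce_sum(w)              # gradient w.r.t. w is [100, 100]
opt = tf.train.GradientDescentOptimizer(1.0)
grads_and_vars = opt.compute_gradients(loss, [w])

clipped_value = [(tf.clip_by_value(g, -25.0, 25.0), v)
                 for g, v in grads_and_vars if g is not None]
grads, variables = zip(*grads_and_vars)
clipped_norm, _ = tf.clip_by_global_norm(grads, 25.0)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run([clipped_value[0][0], clipped_norm[0]]))
    # element-wise: [25, 25]; global-norm: ~[17.68, 17.68] (joint norm rescaled to 25)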