def start_interaction(self, env_fns, dynamics, nlump=2):
        # Define variables and the computation graph when interaction with the environments begins; initialize the Rollout class.
        self.loss_names, self._losses = zip(*list(self.to_report.items()))

        # Define losses, gradients, and backprop. During training, sess.run(self._train) runs one optimization step.
        params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        params_dvae = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope="dvae_reward")
        print("total params:", np.sum([np.prod(v.get_shape().as_list()) for v in params]))      # 6629459
        print("dvae params:", np.sum([np.prod(v.get_shape().as_list()) for v in params_dvae]))  # 2726144
        if MPI.COMM_WORLD.Get_size() > 1:
            trainer = MpiAdamOptimizer(learning_rate=self.ph_lr, comm=MPI.COMM_WORLD)
        else:
            trainer = tf.train.AdamOptimizer(learning_rate=self.ph_lr)
        gradsandvars = trainer.compute_gradients(self.total_loss, params)
        self._train = trainer.apply_gradients(gradsandvars)

        # Added (bai): compute the DVAE gradients separately.
        gradsandvars_dvae = trainer.compute_gradients(self.dynamics_loss, params_dvae)
        self._train_dvae = trainer.apply_gradients(gradsandvars_dvae)
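        # Note: self._train applies total_loss gradients to all trainable
        # variables, while self._train_dvae applies dynamics_loss gradients
        # to the "dvae_reward" scope only; both ops share the same Adam
        # instance (and hence its per-variable moment accumulators).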

        if MPI.COMM_WORLD.Get_rank() == 0:
            getsess().run(tf.variables_initializer(tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)))
        bcast_tf_vars_from_root(getsess(), tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES))

        self.all_visited_rooms = []
        self.all_scores = []
        self.nenvs = nenvs = len(env_fns)        # default: 128
        self.nlump = nlump                       # default: 1
        self.lump_stride = nenvs // self.nlump   # 128 // 1 = 128
        self.envs = [
            VecEnv(env_fns[l * self.lump_stride: (l + 1) * self.lump_stride], spaces=[self.ob_space, self.ac_space]) for
            l in range(self.nlump)]

        # The Rollout class is defined in rollouts.py.
        self.rollout = Rollout(ob_space=self.ob_space, ac_space=self.ac_space, nenvs=nenvs,
                               nsteps_per_seg=self.nsteps_per_seg,
                               nsegs_per_env=self.nsegs_per_env, nlumps=self.nlump,
                               envs=self.envs,
                               policy=self.stochpol,
                               int_rew_coeff=self.int_coeff,
                               ext_rew_coeff=self.ext_coeff,
                               record_rollouts=self.use_recorder,
                               dynamics=dynamics)

        # Buffers of shape (number of envs (threads), rollout length T).
        self.buf_advs = np.zeros((nenvs, self.rollout.nsteps), np.float32)
        self.buf_rets = np.zeros((nenvs, self.rollout.nsteps), np.float32)

        if self.normrew:
            self.rff = RewardForwardFilter(self.gamma)
            self.rff_rms = RunningMeanStd()

        self.step_count = 0
        self.t_last_update = time.time()
        self.t_start = time.time()
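
The normrew branch above only allocates the two helpers; the intrinsic
rewards are normalized later, during the update step. Below is a minimal
NumPy sketch of what these helpers typically do, assuming they behave like
the identically named classes in large-scale-curiosity-style codebases
(an illustration, not this file's actual implementation):

import numpy as np

class RewardForwardFilter(object):
    # Discounted running sum of rewards, one accumulator per environment.
    def __init__(self, gamma):
        self.rewems = None
        self.gamma = gamma

    def update(self, rews):
        if self.rewems is None:
            self.rewems = rews
        else:
            self.rewems = self.rewems * self.gamma + rews
        return self.rewems

class RunningMeanStd(object):
    # Streaming mean/variance via the parallel-variance update rule.
    def __init__(self, epsilon=1e-4):
        self.mean, self.var, self.count = 0.0, 1.0, epsilon

    def update(self, x):
        b_mean, b_var, b_count = x.mean(), x.var(), x.size
        delta = b_mean - self.mean
        tot = self.count + b_count
        m2 = (self.var * self.count + b_var * b_count
              + delta ** 2 * self.count * b_count / tot)
        self.mean = self.mean + delta * b_count / tot
        self.var, self.count = m2 / tot, tot

# Typical use: divide rewards by the running std of the discounted return.
rff, rff_rms = RewardForwardFilter(0.99), RunningMeanStd()
rews_t = np.random.rand(10, 128)                 # (nsteps, nenvs)
rffs = np.array([rff.update(r) for r in rews_t])
rff_rms.update(rffs.ravel())
norm_rews = rews_t / np.sqrt(rff_rms.var)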
Example #2
    def start_interaction(self, env_fns, dynamics, nlump=2):
        self.loss_names, self._losses = zip(*list(self.to_report.items()))

        params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        if MPI.COMM_WORLD.Get_size() > 1:
            trainer = MpiAdamOptimizer(learning_rate=self.ph_lr,
                                       comm=MPI.COMM_WORLD)
        else:
            trainer = tf.train.AdamOptimizer(learning_rate=self.ph_lr)
        gradsandvars = trainer.compute_gradients(self.total_loss, params)
        self._train = trainer.apply_gradients(gradsandvars)

        if MPI.COMM_WORLD.Get_rank() == 0:
            getsess().run(
                tf.variables_initializer(
                    tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)))
        bcast_tf_vars_from_root(
            getsess(), tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES))

        self.all_visited_rooms = []
        self.all_scores = []
        self.nenvs = nenvs = len(env_fns)
        self.nlump = nlump
        self.lump_stride = nenvs // self.nlump
        self.envs = [
            VecEnv(env_fns[l * self.lump_stride:(l + 1) * self.lump_stride],
                   spaces=[self.ob_space, self.ac_space])
            for l in range(self.nlump)
        ]

        self.rollout = Rollout(hps=self.hps,
                               ob_space=self.ob_space,
                               ac_space=self.ac_space,
                               nenvs=nenvs,
                               nsteps_per_seg=self.nsteps_per_seg,
                               nsegs_per_env=self.nsegs_per_env,
                               nlumps=self.nlump,
                               envs=self.envs,
                               policy=self.stochpol,
                               int_rew_coeff=self.int_coeff,
                               ext_rew_coeff=self.ext_coeff,
                               record_rollouts=self.use_recorder,
                               dynamics=dynamics)

        self.buf_advs = np.zeros((nenvs, self.rollout.nsteps), np.float32)
        self.buf_advs_int = np.zeros((nenvs, self.rollout.nsteps), np.float32)
        self.buf_advs_ext = np.zeros((nenvs, self.rollout.nsteps), np.float32)

        self.buf_rets = np.zeros((nenvs, self.rollout.nsteps), np.float32)
        self.buf_rets_int = np.zeros((nenvs, self.rollout.nsteps), np.float32)
        self.buf_rets_ext = np.zeros((nenvs, self.rollout.nsteps), np.float32)

        if self.normrew:
            self.rff = RewardForwardFilter(self.gamma)
            self.rff_rms = RunningMeanStd()

        self.step_count = 0
        self.t_last_update = time.time()
        self.t_start = time.time()
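
Example #2 additionally keeps separate intrinsic and extrinsic advantage
and return buffers. The snippet itself never shows how they are merged; a
hedged sketch of the usual weighted combination (the rule below is an
assumption modeled on two-value-head setups such as RND, reusing the
int_coeff/ext_coeff names from above):

import numpy as np

def combine_advantages(advs_int, advs_ext, int_coeff, ext_coeff):
    # Weighted sum of the intrinsic and extrinsic advantage streams.
    return int_coeff * advs_int + ext_coeff * advs_ext

nenvs, nsteps = 4, 8
advs_int = np.random.randn(nenvs, nsteps).astype(np.float32)
advs_ext = np.random.randn(nenvs, nsteps).astype(np.float32)
buf_advs = combine_advantages(advs_int, advs_ext, 1.0, 2.0)
assert buf_advs.shape == (nenvs, nsteps)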
Example #3
    def start_interaction(self, env_fns, dynamics, nlump=2):
        self.loss_names, self._losses = zip(*list(self.to_report.items()))
        self.global_step = tf.Variable(0, trainable=False)
        params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        # Only the first agent (agent_num is None) builds an optimizer and
        # train op; subsequent agents reuse the ops stored in the graph
        # collections below.
        if MPI.COMM_WORLD.Get_size() > 1:
            if self.agent_num is None:
                trainer = MpiAdamOptimizer(learning_rate=self.ph_lr,
                                           comm=MPI.COMM_WORLD)

        else:
            if self.agent_num is None:
                if self.optim == 'adam':
                    trainer = tf.train.AdamOptimizer(learning_rate=self.ph_lr)
                elif self.optim == 'sgd':
                    print("using sgd")
                    print("________________________")
                    if self.decay:
                        self.decay_lr = tf.train.exponential_decay(
                            self.ph_lr,
                            self.global_step,
                            2500,
                            .96,
                            staircase=True)
                        trainer = tf.train.GradientDescentOptimizer(
                            learning_rate=self.decay_lr)
                    else:
                        trainer = tf.train.GradientDescentOptimizer(
                            learning_rate=self.ph_lr)
                elif self.optim == 'momentum':
                    print('using momentum')
                    trainer = tf.train.MomentumOptimizer(
                        learning_rate=self.ph_lr, momentum=0.9)
        if self.agent_num is None:
            gradsandvars = trainer.compute_gradients(self.total_loss, params)
            l2_norm = lambda t: tf.sqrt(tf.reduce_sum(tf.pow(t, 2)))
            if self.log_grads:
                for grad, var in gradsandvars:
                    tf.summary.histogram(var.name + '/gradient', l2_norm(grad))
                    tf.summary.histogram(var.name + '/value', l2_norm(var))
                    grad_mean = tf.reduce_mean(tf.abs(grad))
                    tf.summary.scalar(var.name + '/grad_mean', grad_mean)
                if self.decay:
                    tf.summary.scalar('decay_lr', self.decay_lr)
                self._summary = tf.summary.merge_all()
                tf.add_to_collection("summary_op", self._summary)
            if self.grad_clip > 0:
                grads, gradvars = zip(*gradsandvars)
                grads, _ = tf.clip_by_global_norm(grads, self.grad_clip)
                gradsandvars = list(zip(grads, gradvars))

            self._train = trainer.apply_gradients(gradsandvars,
                                                  global_step=self.global_step)
            self._updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            self._train = tf.group(self._train, self._updates)
            tf.add_to_collection("train_op", self._train)
        else:
            self._train = tf.get_collection("train_op")[0]
            if self.log_grads:
                self._summary = tf.get_collection("summary_op")[0]

        if MPI.COMM_WORLD.Get_rank() == 0:
            getsess().run(
                tf.variables_initializer(
                    tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)))
        bcast_tf_vars_from_root(
            getsess(), tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES))

        self.all_visited_rooms = []
        self.all_scores = []
        self.nenvs = nenvs = len(env_fns)
        self.nlump = nlump
        self.lump_stride = nenvs // self.nlump
        self.envs = [
            VecEnv(env_fns[l * self.lump_stride:(l + 1) * self.lump_stride],
                   spaces=[self.env_ob_space, self.ac_space])
            for l in range(self.nlump)
        ]

        self.rollout = Rollout(ob_space=self.ob_space,
                               ac_space=self.ac_space,
                               nenvs=nenvs,
                               nsteps_per_seg=self.nsteps_per_seg,
                               nsegs_per_env=self.nsegs_per_env,
                               nlumps=self.nlump,
                               envs=self.envs,
                               policy=self.stochpol,
                               int_rew_coeff=self.int_coeff,
                               ext_rew_coeff=self.ext_coeff,
                               record_rollouts=self.use_recorder,
                               dynamics=dynamics,
                               exp_name=self.exp_name,
                               env_name=self.env_name,
                               video_log_freq=self.video_log_freq,
                               model_save_freq=self.model_save_freq,
                               use_apples=self.use_apples,
                               multi_envs=self.multi_envs,
                               lstm=self.lstm,
                               lstm1_size=self.lstm1_size,
                               lstm2_size=self.lstm2_size,
                               depth_pred=self.depth_pred,
                               early_stop=self.early_stop,
                               aux_input=self.aux_input)

        self.buf_advs = np.zeros((nenvs, self.rollout.nsteps), np.float32)
        self.buf_rets = np.zeros((nenvs, self.rollout.nsteps), np.float32)

        if self.normrew:
            self.rff = RewardForwardFilter(self.gamma)
            self.rff_rms = RunningMeanStd()

        self.step_count = 0
        self.t_last_update = time.time()
        self.t_start = time.time()
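
Example #3 optionally swaps Adam for SGD or momentum and adds a staircase
learning-rate decay. For intuition, tf.train.exponential_decay(...,
staircase=True) computes lr * decay_rate ** floor(global_step /
decay_steps); a plain-Python sketch using the constants from above:

def staircase_decay(lr0, global_step, decay_steps=2500, decay_rate=0.96):
    # Integer division gives the number of completed decay periods.
    return lr0 * decay_rate ** (global_step // decay_steps)

print(staircase_decay(1e-4, 0))      # 1e-04
print(staircase_decay(1e-4, 2499))   # 1e-04 (still in the first period)
print(staircase_decay(1e-4, 2500))   # 9.6e-05
print(staircase_decay(1e-4, 10000))  # 1e-4 * 0.96**4 ~= 8.49e-05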
Example #4
    def start_interaction(self, env_fns, dynamics, nlump=2):
        self.loss_names, self._losses = zip(*list(self.to_report.items()))

        params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        if MPI.COMM_WORLD.Get_size() > 1:
            trainer = MpiAdamOptimizer(learning_rate=self.ph_lr,
                                       comm=MPI.COMM_WORLD)
        else:
            trainer = tf.train.AdamOptimizer(learning_rate=self.ph_lr)

        # Clamp each gradient entry to [-25, 25]; gradients that come back
        # as None (variables unused by the loss) are passed through.
        def ClipIfNotNone(grad):
            return tf.clip_by_value(grad, -25.0,
                                    25.0) if grad is not None else grad

        gradsandvars = trainer.compute_gradients(self.total_loss, params)
        gradsandvars = [(ClipIfNotNone(g), v) for g, v in gradsandvars]

        self._train = trainer.apply_gradients(gradsandvars)

        if MPI.COMM_WORLD.Get_rank() == 0:
            getsess().run(
                tf.variables_initializer(
                    tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)))
        bcast_tf_vars_from_root(
            getsess(), tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES))

        self.all_visited_rooms = []
        self.all_scores = []
        self.nenvs = nenvs = len(env_fns)
        print('nenvs:', self.nenvs)
        self.nlump = nlump
        print('nlump:', self.nlump)
        self.lump_stride = nenvs // self.nlump
        print('lump_stride:', self.lump_stride)
        print('ob_space:', self.ob_space)
        print('ac_space:', self.ac_space)
        self.envs = [
            VecEnv(env_fns[l * self.lump_stride:(l + 1) * self.lump_stride],
                   spaces=[self.ob_space, self.ac_space])
            for l in range(self.nlump)
        ]
        self.rollout = Rollout(ob_space=self.ob_space,
                               ac_space=self.ac_space,
                               nenvs=nenvs,
                               nsteps_per_seg=self.nsteps_per_seg,
                               nsegs_per_env=self.nsegs_per_env,
                               nlumps=self.nlump,
                               envs=self.envs,
                               policy=self.stochpol,
                               int_rew_coeff=self.int_coeff,
                               ext_rew_coeff=self.ext_coeff,
                               record_rollouts=self.use_recorder,
                               dynamics=dynamics,
                               exp_name=self.exp_name,
                               env_name=self.env_name,
                               to_eval=self.to_eval)

        self.buf_advs = np.zeros((nenvs, self.rollout.nsteps), np.float32)
        self.buf_rets = np.zeros((nenvs, self.rollout.nsteps), np.float32)

        if self.normrew:
            self.rff = RewardForwardFilter(self.gamma)
            self.rff_rms = RunningMeanStd()

        self.step_count = 0
        self.t_last_update = time.time()
        self.t_start = time.time()
        self.saver = tf.train.Saver(max_to_keep=5)
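
The examples clip gradients in two different ways: Example #3 rescales all
gradients together by their global norm (tf.clip_by_global_norm), while
Example #4 clamps every entry independently (tf.clip_by_value inside
ClipIfNotNone). A NumPy sketch contrasting the two; the function names
here are illustrative, not from the source:

import numpy as np

def clip_elementwise(grads, clip=25.0):
    # Example #4's scheme: each entry is clamped to [-clip, clip], so the
    # gradient direction can change; None gradients pass through.
    return [None if g is None else np.clip(g, -clip, clip) for g in grads]

def clip_global_norm(grads, max_norm):
    # Example #3's scheme: every gradient is scaled by the same factor
    # (only downwards), so the overall direction is preserved.
    gnorm = np.sqrt(sum(np.sum(g ** 2) for g in grads if g is not None))
    scale = min(1.0, max_norm / (gnorm + 1e-8))
    return [None if g is None else g * scale for g in grads], gnorm

grads = [np.full((2, 2), 30.0), None, np.ones(3)]
print(clip_elementwise(grads)[0][0, 0])        # 25.0
clipped, gnorm = clip_global_norm(grads, 10.0)
print(gnorm, clipped[0][0, 0])                 # ~60.02, ~5.0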