コード例 #1
0
ファイル: learner.py プロジェクト: AaronHavens/safe_rl
    def __init__(self, env, master_policy, old_master_policy, sub_policies, old_sub_policies,
                 comm, clip_param=0.2, entcoeff=0, optim_epochs=10,
                 optim_stepsize=3e-4, optim_batchsize=64):
        """Build gradient functions and optimizers for the master and sub-policies.

        Stores the optimisation hyperparameters, creates the placeholders fed
        to the losses, wraps the flattened loss gradients in callables, creates
        one ``MpiAdam`` per policy, calls ``U.initialize()`` and finally syncs
        every optimizer.

        Only the master optimizer receives ``comm``; the sub-policy optimizers
        are created with ``MpiAdam``'s default communicator.
        """
        # Hyperparameters are stored untouched for use by other methods.
        self.clip_param = clip_param
        self.entcoeff = entcoeff
        self.optim_epochs = optim_epochs
        self.optim_stepsize = optim_stepsize
        self.optim_batchsize = optim_batchsize

        self.num_subpolicies = len(sub_policies)
        self.sub_policies = sub_policies
        self.master_policy = master_policy

        # Not used below; kept so the attribute lookups on env still happen.
        ob_space = env.observation_space
        ac_space = env.action_space

        # Placeholders shared by the master loss and every sub-policy loss.
        self.sp_ac = sub_policies[0].pdtype.sample_placeholder([None])
        atarg_ph = tf.placeholder(dtype=tf.float32, shape=[None])
        ret_ph = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
        sub_ob = U.get_placeholder_cached(name="ob")
        master_ob = U.get_placeholder_cached(name="adv_ob")
        master_ac = master_policy.pdtype.sample_placeholder([None])

        # Master policy: flattened gradient of its loss plus its optimizer.
        master_loss_expr = self.policy_loss_master(
            master_policy, old_master_policy, master_ob, master_ac,
            atarg_ph, ret_ph, clip_param)
        master_vars = master_policy.get_trainable_variables()
        self.master_policy_var_list = master_vars
        master_grad = U.flatgrad(master_loss_expr, master_vars)
        self.master_loss = U.function(
            [master_ob, master_ac, atarg_ph, ret_ph], master_grad)
        self.master_adam = MpiAdam(master_vars, comm=comm)

        # Callable that copies the current master variables into the old master.
        master_copy_ops = [
            tf.assign(old_var, new_var)
            for old_var, new_var in zipsame(
                old_master_policy.get_variables(),
                master_policy.get_variables())
        ]
        self.assign_old_eq_new = U.function([], [], updates=master_copy_ops)

        self.assign_subs = []
        self.change_subs = []
        self.adams = []
        self.losses = []

        for idx in range(self.num_subpolicies):
            sub_pi = sub_policies[idx]
            sub_vars = sub_pi.get_trainable_variables()
            self.adams.append(MpiAdam(sub_vars))

            # The second policy handed to policy_loss is the entry at index
            # (idx - 1) % 2, i.e. the other one when there are two sub-policies.
            partner_pi = sub_policies[(idx - 1) % 2]
            old_sub_pi = old_sub_policies[idx]
            sub_loss_expr = self.policy_loss(
                sub_pi, partner_pi, old_sub_pi, sub_ob, self.sp_ac,
                atarg_ph, ret_ph, clip_param)
            sub_grad = U.flatgrad(sub_loss_expr, sub_vars)
            self.losses.append(
                U.function([sub_ob, self.sp_ac, atarg_ph, ret_ph], sub_grad))

            # Callable that copies this sub-policy into its "old" counterpart.
            sub_copy_ops = [
                tf.assign(old_var, new_var)
                for old_var, new_var in zipsame(
                    old_sub_policies[idx].get_variables(),
                    sub_policies[idx].get_variables())
            ]
            self.assign_subs.append(U.function([], [], updates=sub_copy_ops))

            # Rebound on every pass; after the loop it is built from the last
            # sub-policy's variable list.
            self.zerograd = U.function([], self.nograd(sub_vars))

        U.initialize()

        self.master_adam.sync()
        for sub_adam in self.adams:
            sub_adam.sync()
コード例 #2
0
ファイル: learner.py プロジェクト: TheCrazyT/mlsh
    def __init__(self, env, policy, old_policy, sub_policies, old_sub_policies, comm,
                 clip_param=0.2, entcoeff=0, optim_epochs=10, optim_stepsize=3e-4,
                 optim_batchsize=64):
        """Build gradient functions and optimizers for the master and sub-policies.

        Stores the hyperparameters, optionally opens a summary writer (when the
        module-level ``WRITE_SCALAR`` flag is truthy), builds the master loss
        gradient function and a ``total_loss`` scalar-summary function, then
        builds one gradient function, one ``MpiAdam`` and one "copy to old"
        function per sub-policy. Finishes by calling ``U.initialize()`` and
        syncing every optimizer.

        Only the master optimizer receives ``comm``; the sub-policy optimizers
        are created with ``MpiAdam``'s default communicator.
        """
        self.policy = policy

        # Hyperparameters are stored untouched for use by other methods.
        self.clip_param = clip_param
        self.entcoeff = entcoeff
        self.optim_epochs = optim_epochs
        self.optim_stepsize = optim_stepsize
        self.optim_batchsize = optim_batchsize

        self.num_subpolicies = len(sub_policies)
        self.sub_policies = sub_policies

        # Not used below; kept so the attribute lookups on env still happen.
        ob_space = env.observation_space
        ac_space = env.action_space

        if WRITE_SCALAR:
            # Directory name embeds the current time (truncated by %d).
            summary_dir = osp.join("savedir/", 'checkpoints', 'scalar%d' % time.time())
            self.scalar_writer = tf.summary.FileWriter(summary_dir)

        # Inputs for training the master policy (theta).
        ob_ph = U.get_placeholder_cached(name="ob")
        master_ac = policy.pdtype.sample_placeholder([None])
        atarg_ph = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
        ret_ph = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

        master_loss_expr = self.policy_loss(
            policy, old_policy, ob_ph, master_ac, atarg_ph, ret_ph, clip_param)
        master_vars = policy.get_trainable_variables()
        self.master_policy_var_list = master_vars
        master_grad = U.flatgrad(master_loss_expr, master_vars)
        self.master_loss = U.function(
            [ob_ph, master_ac, atarg_ph, ret_ph], master_grad)
        self.master_adam = MpiAdam(master_vars, comm=comm)

        # Scalar summary of the master loss, evaluated on the same inputs.
        loss_summary = tf.summary.scalar("total_loss", master_loss_expr)
        self.calc_summary = U.function(
            [ob_ph, master_ac, atarg_ph, ret_ph], [loss_summary])

        # Callable that copies the current master variables into the old master.
        master_copy_ops = [
            tf.assign(old_var, new_var)
            for old_var, new_var in zipsame(
                old_policy.get_variables(), policy.get_variables())
        ]
        self.assign_old_eq_new = U.function([], [], updates=master_copy_ops)

        self.assign_subs = []
        self.change_subs = []
        self.adams = []
        self.losses = []

        # One action placeholder, taken from the first sub-policy's pdtype, is
        # shared by all sub-policy losses.
        self.sp_ac = sub_policies[0].pdtype.sample_placeholder([None])

        for idx in range(self.num_subpolicies):
            sub_pi = sub_policies[idx]
            sub_vars = sub_pi.get_trainable_variables()
            self.adams.append(MpiAdam(sub_vars))

            sub_loss_expr = self.policy_loss(
                sub_pi, old_sub_policies[idx], ob_ph, self.sp_ac,
                atarg_ph, ret_ph, clip_param)
            sub_grad = U.flatgrad(sub_loss_expr, sub_vars)
            self.losses.append(
                U.function([ob_ph, self.sp_ac, atarg_ph, ret_ph], sub_grad))

            # Callable that copies this sub-policy into its "old" counterpart.
            sub_copy_ops = [
                tf.assign(old_var, new_var)
                for old_var, new_var in zipsame(
                    old_sub_policies[idx].get_variables(),
                    sub_policies[idx].get_variables())
            ]
            self.assign_subs.append(U.function([], [], updates=sub_copy_ops))

            # Rebound on every pass; after the loop it is built from the last
            # sub-policy's variable list.
            self.zerograd = U.function([], self.nograd(sub_vars))

        U.initialize()

        self.master_adam.sync()
        for sub_adam in self.adams:
            sub_adam.sync()
コード例 #3
0
    def __init__(self, envs, policies, sub_policies, old_policies, old_sub_policies, 
            clip_param=0.2, vfcoeff=1., entcoeff=0, divcoeff=0., optim_epochs=10, 
            master_lr=1e-3, sub_lr=3e-4, optim_batchsize=64, envsperbatch=None, 
            num_rollouts=None, nlstm=256, recurrent=False):
        """Build losses, Adam train ops and "copy to old" functions for all policies.

        For every master policy and every sub-policy this creates the input
        placeholders, calls ``self.policy_loss`` and unpacks its seven return
        values into per-policy tuples, builds a ``tf.train.AdamOptimizer``
        train op from ``tf.gradients``, and builds a function that assigns the
        policy's variables to its ``old_*`` counterpart. Ends with
        ``U.initialize()``.

        Args:
            envs: indexable collection of environments; only ``envs[0]``'s
                ``observation_space`` and ``action_space`` are read here.
            policies, old_policies: master policies and their "old" copies.
            sub_policies, old_sub_policies: sub-policies and their "old" copies.
            clip_param, vfcoeff, entcoeff: forwarded to ``self.policy_loss``.
            divcoeff: forwarded to ``self.policy_loss`` for sub-policies only.
            optim_epochs, optim_batchsize: stored on the instance.
            master_lr, sub_lr: Adam learning rates for master / sub-policies.
            envsperbatch, num_rollouts: their product is stored as
                ``self.nbatch``; both are required when ``recurrent`` is true
                because they fix static placeholder shapes.
            nlstm: the recurrent state placeholders are ``2 * nlstm`` wide.
            recurrent: when true, observation placeholders get a static batch
                dimension and mask/state placeholders are created as well.

        Raises:
            ValueError: if ``recurrent`` is true and ``envsperbatch`` or
                ``num_rollouts`` is ``None``.
        """
        self.policies = policies
        self.sub_policies = sub_policies
        self.old_policies = old_policies
        self.old_sub_policies = old_sub_policies
        self.clip_param = clip_param
        self.entcoeff = entcoeff
        self.optim_epochs = optim_epochs
        self.optim_batchsize = optim_batchsize
        self.num_master_groups = num_master_groups = len(policies)
        self.num_subpolicies = num_subpolicies = len(sub_policies)
        self.ob_space = envs[0].observation_space
        self.ac_space = envs[0].action_space

        # Both sizes default to None, so multiplying them unconditionally would
        # raise an opaque TypeError. The batch size is only needed for the
        # static shapes of the recurrent placeholders below.
        if num_rollouts is None or envsperbatch is None:
            if recurrent:
                raise ValueError(
                    "num_rollouts and envsperbatch must both be given when "
                    "recurrent=True")
            nbatch = None
        else:
            nbatch = num_rollouts * envsperbatch
        self.nbatch = nbatch
        self.envsperbatch = envsperbatch

        # --- master policies -------------------------------------------------
        self.master_obs = [U.get_placeholder(name="master_ob_%i"%x, dtype=tf.float32,
            shape=[None] + list(self.ob_space.shape)) for x in range(num_master_groups)]
        # Every master action placeholder is built from policies[0]'s pdtype.
        self.master_acs = [policies[0].pdtype.sample_placeholder([None]) 
                for _ in range(num_master_groups)]
        self.master_atargs = [tf.placeholder(dtype=tf.float32, shape=[None])
                for _ in range(num_master_groups)]
        self.master_ret = [tf.placeholder(dtype=tf.float32, shape=[None])
                for _ in range(num_master_groups)]
        # policy_loss returns seven values per policy; zip(*) regroups them
        # into seven tuples indexed by master group. Masters use a constant
        # mask of 1 and the last return value is discarded.
        retvals = zip(*[self.policy_loss(policies[i], 
            old_policies[i], self.master_obs[i], self.master_acs[i], self.master_atargs[i], 
            self.master_ret[i], clip_param, mask=tf.constant(1.), vfcoeff=vfcoeff, 
            entcoeff=entcoeff) for i in range(num_master_groups)])
        self.master_losses, self.master_kl, self.master_pol_surr, self.master_vf_loss, \
                self.master_entropy, self.master_values, _ = retvals 

        master_trainers = [tf.train.AdamOptimizer(learning_rate=master_lr, 
            name='master_adam_%i' % group) for group in range(num_master_groups)]
        master_params = [policies[i].get_trainable_variables() 
                for i in range(num_master_groups)] 
        master_grads = [tf.gradients(self.master_losses[i], master_params[i])
                for i in range(num_master_groups)]
        # apply_gradients expects (gradient, variable) pairs.
        master_grads = [list(zip(g, p)) for g, p in zip(master_grads, master_params)]
        # TODO: gradient clipping
        self.assign_old_eq_new = [U.function([],[], updates=[tf.assign(oldv, newv)
                for (oldv, newv) in zipsame(old_policies[i].get_variables(), 
                policies[i].get_variables())]) for i in range(num_master_groups)]
        self.master_train_steps = [master_trainers[i].apply_gradients(master_grads[i])
                for i in range(num_master_groups)]

        # --- sub-policies ----------------------------------------------------
        if not recurrent:
            self.sub_obs = [U.get_placeholder(name="sub_ob_%i"%x, dtype=tf.float32,
                shape=[None] + list(self.ob_space.shape)) for x in range(num_subpolicies)]
        # Every sub-policy action placeholder is built from sub_policies[0]'s pdtype.
        self.sub_acs = [sub_policies[0].pdtype.sample_placeholder([None]) 
                for _ in range(num_subpolicies)]
        self.sub_atargs = [tf.placeholder(dtype=tf.float32, shape=[None])
                for _ in range(num_subpolicies)]
        self.sub_ret = [tf.placeholder(dtype=tf.float32, shape=[None])
                for _ in range(num_subpolicies)]
        # Created, but currently not passed to policy_loss (logpacs=None below).
        self.logpacs = [tf.placeholder(dtype=tf.float32, shape=[num_subpolicies, None])
                for _ in range(num_subpolicies)]
        self.loss_masks = [tf.placeholder(dtype=tf.float32, shape=[None])
                for _ in range(num_subpolicies)]
        if recurrent:
            # Recurrent inputs use static leading dimensions: nbatch for
            # observations/masks and envsperbatch for the states.
            self.sub_obs = [U.get_placeholder(name="sub_ob_%i"%x, dtype=tf.float32,
                shape=[nbatch] + list(self.ob_space.shape)) for x in range(num_subpolicies)]
            self.sub_masks = [U.get_placeholder(name="masks_%i" % sub, dtype=tf.float32, 
                shape=[nbatch]) for sub in range(num_subpolicies)]
            self.sub_states = [U.get_placeholder(name="states_%i" % sub, dtype=tf.float32, 
                shape=[envsperbatch, 2*nlstm]) for sub in range(num_subpolicies)]
        sub_retvals = zip(*[self.policy_loss(sub_policies[i], 
            old_sub_policies[i], self.sub_obs[i], self.sub_acs[i], self.sub_atargs[i], 
            self.sub_ret[i], clip_param, mask=self.loss_masks[i], vfcoeff=vfcoeff, 
            entcoeff=entcoeff, divcoeff=divcoeff, logpacs=None)
            for i in range(num_subpolicies)])
        self.sub_losses, self.sub_kl, self.sub_pol_surr, self.sub_vf_loss, \
                self.sub_entropy, self.sub_values, self.div_loss = sub_retvals 

        sub_trainers = [tf.train.AdamOptimizer(learning_rate=sub_lr)
                for _ in range(num_subpolicies)]
        sub_params = [sub_policies[i].get_trainable_variables() 
                for i in range(num_subpolicies)] 
        sub_grads = [tf.gradients(self.sub_losses[i], sub_params[i])
                for i in range(num_subpolicies)]
        # apply_gradients expects (gradient, variable) pairs.
        sub_grads = [list(zip(g, p)) for g, p in zip(sub_grads, sub_params)]
        # TODO: gradient clipping
        self.subs_assign_old_eq_new = [U.function([],[], updates=[tf.assign(oldv, newv)
                for (oldv, newv) in zipsame(old_sub_policies[i].get_variables(), 
                sub_policies[i].get_variables())]) for i in range(num_subpolicies)]
        self.sub_train_steps = [sub_trainers[i].apply_gradients(sub_grads[i])
                for i in range(num_subpolicies)]

        U.initialize()
コード例 #4
0
    def __init__(self,
                 env,
                 sub_policy,
                 old_sub_policy,
                 comm,
                 clip_param=0.2,
                 entcoeff=0,
                 optim_epochs=10,
                 optim_stepsize=3e-4,
                 optim_batchsize=64,
                 args=None):
        """Build the gradient function and optimizer for a single sub-policy.

        Stores the hyperparameters, creates the loss placeholders (including
        one for the entropy coefficient), wraps the flattened loss gradient in
        a callable, creates the ``MpiAdam`` optimizer and the "copy to old"
        function, calls ``U.initialize()`` and syncs the optimizer.

        NOTE: this variant has no master policy; ``comm`` is accepted but not
        used anywhere in this method.
        """
        # Hyperparameters are stored untouched for use by other methods.
        self.clip_param = clip_param
        self.entcoeff = entcoeff
        self.optim_epochs = optim_epochs
        self.optim_stepsize = optim_stepsize
        self.optim_batchsize = optim_batchsize
        self.sub_policy = sub_policy
        self.args = args

        # Not used below; kept so the attribute lookups on env still happen.
        ob_space = env.observation_space
        ac_space = env.action_space

        # Inputs for the sub-policy loss.
        ob_ph = U.get_placeholder_cached(name="ob")
        atarg_ph = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
        ret_ph = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
        # The entropy coefficient is fed at call time through this placeholder;
        # the constructor argument itself is only stored on self.entcoeff.
        entcoeff_ph = tf.placeholder(dtype=tf.float32, name="entcoef")

        # Single-element lists mirror the list-valued attributes of the
        # multi-sub-policy layout.
        self.assign_subs = []
        self.change_subs = []
        self.adams = []
        self.losses = []

        self.sp_ac = sub_policy.pdtype.sample_placeholder([None])

        sub_vars = sub_policy.get_trainable_variables()
        sub_adam = MpiAdam(sub_vars)
        self.adams.append(sub_adam)

        sub_loss_expr = self.policy_loss(
            sub_policy, old_sub_policy, ob_ph, self.sp_ac,
            atarg_ph, ret_ph, clip_param, entcoeff_ph)
        sub_grad = U.flatgrad(sub_loss_expr, sub_vars)
        self.losses.append(
            U.function([ob_ph, self.sp_ac, atarg_ph, ret_ph, entcoeff_ph],
                       sub_grad))

        # Callable that copies the sub-policy into its "old" counterpart.
        sub_copy_ops = [
            tf.assign(old_var, new_var)
            for old_var, new_var in zipsame(old_sub_policy.get_variables(),
                                            sub_policy.get_variables())
        ]
        self.assign_subs.append(U.function([], [], updates=sub_copy_ops))

        self.zerograd = U.function([], self.nograd(sub_vars))

        U.initialize()

        self.adams[0].sync()