Example #1
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 total_timesteps,
                 nprocs=32,
                 nsteps=20,
                 nstack=4,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 vf_fisher_coef=1.0,
                 lr=0.25,
                 max_grad_norm=0.5,
                 kfac_clip=0.001,
                 lrschedule='linear'):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=nprocs,
                                inter_op_parallelism_threads=nprocs)
        config.gpu_options.allow_growth = True
        self.sess = sess = tf.Session(config=config)
        nact = ac_space.n
        nbatch = nenvs * nsteps
        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        PG_LR = tf.placeholder(tf.float32, [])
        VF_LR = tf.placeholder(tf.float32, [])

        self.model = step_model = policy(sess,
                                         ob_space,
                                         ac_space,
                                         nenvs,
                                         1,
                                         nstack,
                                         reuse=False)
        self.model2 = train_model = policy(sess,
                                           ob_space,
                                           ac_space,
                                           nenvs,
                                           nsteps,
                                           nstack,
                                           reuse=True)

        logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.pi, labels=A)
        self.logits = logits = train_model.pi

        ##training loss
        pg_loss = tf.reduce_mean(ADV * logpac)
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        pg_loss = pg_loss - ent_coef * entropy
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        train_loss = pg_loss + vf_coef * vf_loss

        ##Fisher loss construction
        self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac)
        sample_net = train_model.vf + tf.random_normal(tf.shape(
            train_model.vf))
        self.vf_fisher = vf_fisher_loss = -vf_fisher_coef * tf.reduce_mean(
            tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
        self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss

        self.params = params = find_trainable_variables("model")

        self.grads_check = grads = tf.gradients(train_loss, params)

        with tf.device('/gpu:0'):
            self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,\
                momentum=0.9, kfac_update=1, epsilon=0.01,\
                stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)

            update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss,
                                                            var_list=params)
            train_op, q_runner = optim.apply_gradients(list(zip(grads,
                                                                params)))
        self.q_runner = q_runner
        self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = self.lr.value()

            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                PG_LR: cur_lr
            }
            if states != []:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, train_op], td_map)
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)

        self.train = train
        self.save = save
        self.load = load
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        tf.global_variables_initializer().run(session=sess)
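
Note: the constructor above only builds the graph. A minimal usage sketch, assuming a hypothetical `Model` class that wraps this `__init__` and a `Runner` that returns flattened batches of `nenvs * nsteps` transitions; everything except `model.train`, `model.save`, `model.sess` and `model.q_runner` is a placeholder:

# Usage sketch only: Model, CnnPolicy, Runner and env are placeholders.
import tensorflow as tf

total_timesteps = int(1e6)
model = Model(policy=CnnPolicy, ob_space=env.observation_space,
              ac_space=env.action_space, nenvs=16,
              total_timesteps=total_timesteps)
runner = Runner(env, model, nsteps=20, nstack=4, gamma=0.99)

coord = tf.train.Coordinator()
if model.q_runner is not None:
    # K-FAC's asynchronous statistics updates run on their own queue threads.
    enqueue_threads = model.q_runner.create_threads(model.sess, coord=coord,
                                                    start=True)

for update in range(total_timesteps // (16 * 20)):
    obs, states, rewards, masks, actions, values = runner.run()
    policy_loss, value_loss, policy_entropy = model.train(
        obs, states, rewards, masks, actions, values)

coord.request_stop()
model.save("acktr_model.pkl")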
Example #2
    def __init__(self, policy, ob_space, ac_space, nenvs,total_timesteps, nprocs=32, nsteps=20,
                 nstack=4, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
                 kfac_clip=0.001, lrschedule='linear'):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=nprocs,
                                inter_op_parallelism_threads=nprocs)
        config.gpu_options.allow_growth = True
        self.sess = sess = tf.Session(config=config)
        nact = ac_space.n
        nbatch = nenvs * nsteps
        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        PG_LR = tf.placeholder(tf.float32, [])
        VF_LR = tf.placeholder(tf.float32, [])

        self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
        self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True)

        logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)
        self.logits = logits = train_model.pi

        ##training loss
        pg_loss = tf.reduce_mean(ADV*logpac)
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        pg_loss = pg_loss - ent_coef * entropy
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        train_loss = pg_loss + vf_coef * vf_loss


        ##Fisher loss construction
        self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac)
        sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf))
        self.vf_fisher = vf_fisher_loss = - vf_fisher_coef*tf.reduce_mean(tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
        self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss

        self.params = params = find_trainable_variables("model")

        self.grads_check = grads = tf.gradients(train_loss, params)

        with tf.device('/gpu:0'):
            self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,\
                momentum=0.9, kfac_update=1, epsilon=0.01,\
                stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)

            update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
            train_op, q_runner = optim.apply_gradients(list(zip(grads,params)))
        self.q_runner = q_runner
        self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = self.lr.value()

            td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, PG_LR:cur_lr}
            if states != []:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, train_op],
                td_map
            )
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)



        self.train = train
        self.save = save
        self.load = load
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        tf.global_variables_initializer().run(session=sess)
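
These examples all rely on a few helpers that are not shown (`cat_entropy`, `mse`, `find_trainable_variables`, `Scheduler`). As a rough reference sketch based on the baselines `a2c.utils` module the snippets appear to draw from (the exact signatures are an assumption), they compute approximately the following:

import tensorflow as tf

def cat_entropy(logits):
    # Entropy of the categorical distribution softmax(logits), one value per row.
    a0 = logits - tf.reduce_max(logits, 1, keepdims=True)
    ea0 = tf.exp(a0)
    z0 = tf.reduce_sum(ea0, 1, keepdims=True)
    p0 = ea0 / z0
    return tf.reduce_sum(p0 * (tf.log(z0) - a0), 1)

def mse(pred, target):
    # Elementwise squared error; the caller applies reduce_mean.
    return tf.square(pred - target) / 2.0

def find_trainable_variables(key):
    # Equivalent to the tf.get_collection call some of the examples use directly.
    return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=key)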
Example #3
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 total_timesteps,
                 nprocs=32,
                 nscripts=16,
                 nsteps=20,
                 nstack=4,
                 ent_coef=0.1,
                 vf_coef=0.5,
                 vf_fisher_coef=1.0,
                 lr=0.25,
                 max_grad_norm=0.001,
                 kfac_clip=0.001,
                 lrschedule='linear',
                 alpha=0.99,
                 epsilon=1e-5):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=nprocs,
                                inter_op_parallelism_threads=nprocs)
        config.gpu_options.allow_growth = True
        self.sess = sess = tf.Session(config=config)
        nsml.bind(sess=sess)
        #nact = ac_space.n
        nbatch = nenvs * nsteps
        A = tf.placeholder(tf.int32, [nbatch])

        XY0 = tf.placeholder(tf.int32, [nbatch])
        XY1 = tf.placeholder(tf.int32, [nbatch])

        # ADV == TD_TARGET - values
        ADV = tf.placeholder(tf.float32, [nbatch])
        TD_TARGET = tf.placeholder(tf.float32, [nbatch])
        PG_LR = tf.placeholder(tf.float32, [])
        VF_LR = tf.placeholder(tf.float32, [])

        self.model = step_model = policy(sess,
                                         ob_space,
                                         ac_space,
                                         nenvs,
                                         1,
                                         nstack,
                                         reuse=False)
        self.model2 = train_model = policy(sess,
                                           ob_space,
                                           ac_space,
                                           nenvs,
                                           nsteps,
                                           nstack,
                                           reuse=True)

        # Policy 1 : Base Action : train_model.pi label = A

        script_mask = tf.concat([
            tf.zeros([nscripts * nsteps, 1]),
            tf.ones([(nprocs - nscripts) * nsteps, 1])
        ],
                                axis=0)
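        # script_mask: [nbatch, 1]; zeros for the first nscripts * nsteps rows
        # (presumably the scripted environments) and ones for the remaining
        # (nprocs - nscripts) * nsteps learned rows.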

        pi = train_model.pi
        pac_weight = script_mask * (tf.nn.softmax(pi) - 1.0) + 1.0
        pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(A, depth=3), axis=1)
        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=pi,
                                                                   labels=A)
        neglogpac *= tf.stop_gradient(pac_weight)

        inv_A = 1.0 - tf.cast(A, tf.float32)

        xy0_mask = tf.cast(A, tf.float32)
        xy1_mask = tf.cast(A, tf.float32)

        condition0 = tf.equal(xy0_mask, 2)
        xy0_mask = tf.where(condition0, tf.ones(tf.shape(xy0_mask)), xy0_mask)
        xy0_mask = 1.0 - xy0_mask

        condition1 = tf.equal(xy1_mask, 2)
        xy1_mask = tf.where(condition1, tf.zeros(tf.shape(xy1_mask)), xy1_mask)

        # One hot representation of chosen marine.
        # [batch_size, 2]
        pi_xy0 = train_model.pi_xy0
        pac_weight = script_mask * (tf.nn.softmax(pi_xy0) - 1.0) + 1.0
        pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(XY0, depth=1024),
                                   axis=1)

        logpac_xy0 = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=pi_xy0, labels=XY0)
        logpac_xy0 *= tf.stop_gradient(pac_weight)
        logpac_xy0 *= tf.cast(xy0_mask, tf.float32)

        pi_xy1 = train_model.pi_xy1
        pac_weight = script_mask * (tf.nn.softmax(pi_xy1) - 1.0) + 1.0
        pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(XY1, depth=1024),
                                   axis=1)

        # 1D? 2D?
        logpac_xy1 = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=pi_xy1, labels=XY1)
        logpac_xy1 *= tf.stop_gradient(pac_weight)
        logpac_xy1 *= tf.cast(xy1_mask, tf.float32)

        pg_loss = tf.reduce_mean(ADV * neglogpac)
        pg_loss_xy0 = tf.reduce_mean(ADV * logpac_xy0)
        pg_loss_xy1 = tf.reduce_mean(ADV * logpac_xy1)

        vf_ = tf.squeeze(train_model.vf)

        vf_r = tf.concat([
            tf.ones([nscripts * nsteps, 1]),
            tf.zeros([(nprocs - nscripts) * nsteps, 1])
        ],
                         axis=0) * TD_TARGET
        vf_masked = vf_ * script_mask + vf_r

        #vf_mask[0:nscripts * nsteps] = R[0:nscripts * nsteps]

        vf_loss = tf.reduce_mean(mse(vf_masked, TD_TARGET))
        entropy_a = tf.reduce_mean(cat_entropy(train_model.pi))
        entropy_xy0 = tf.reduce_mean(cat_entropy(train_model.pi_xy0))
        entropy_xy1 = tf.reduce_mean(cat_entropy(train_model.pi_xy1))
        entropy = entropy_a + entropy_xy0 + entropy_xy1

        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, _ = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=lr,
                                            decay=alpha,
                                            epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        self.logits = logits = train_model.pi

        # xy0

        self.params_common = params_common = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/common')
        self.params_xy0 = params_xy0 = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES,
            scope='model/xy0') + params_common

        train_loss_xy0 = pg_loss_xy0 - entropy * ent_coef + vf_coef * vf_loss

        self.grads_check_xy0 = grads_xy0 = tf.gradients(
            train_loss_xy0, params_xy0)
        if max_grad_norm is not None:
            grads_xy0, _ = tf.clip_by_global_norm(grads_xy0, max_grad_norm)

        grads_xy0 = list(zip(grads_xy0, params_xy0))
        trainer_xy0 = tf.train.RMSPropOptimizer(learning_rate=lr,
                                                decay=alpha,
                                                epsilon=epsilon)
        _train_xy0 = trainer_xy0.apply_gradients(grads_xy0)

        # xy1

        self.params_xy1 = params_xy1 = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES,
            scope='model/xy1') + params_common

        train_loss_xy1 = pg_loss_xy1 - entropy * ent_coef + vf_coef * vf_loss

        self.grads_check_xy1 = grads_xy1 = tf.gradients(
            train_loss_xy1, params_xy1)
        if max_grad_norm is not None:
            grads_xy1, _ = tf.clip_by_global_norm(grads_xy1, max_grad_norm)

        grads_xy1 = list(zip(grads_xy1, params_xy1))
        trainer_xy1 = tf.train.RMSPropOptimizer(learning_rate=lr,
                                                decay=alpha,
                                                epsilon=epsilon)
        _train_xy1 = trainer_xy1.apply_gradients(grads_xy1)

        self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, td_targets, masks, actions, xy0, xy1, values):
            advs = td_targets - values
            for step in range(len(obs)):
                cur_lr = self.lr.value()

            td_map = {
                train_model.X: obs,
                A: actions,
                XY0: xy0,
                XY1: xy1,
                ADV: advs,
                TD_TARGET: td_targets,
                PG_LR: cur_lr
            }
            if states != []:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            policy_loss, value_loss, policy_entropy, _, \
            policy_loss_xy0, policy_entropy_xy0, _, \
            policy_loss_xy1, policy_entropy_xy1, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train,
                 pg_loss_xy0, entropy_xy0, _train_xy0,
                 pg_loss_xy1, entropy_xy1, _train_xy1],
                td_map)
            return policy_loss, value_loss, policy_entropy, \
                   policy_loss_xy0, policy_entropy_xy0, \
                   policy_loss_xy1, policy_entropy_xy1

        def save(save_path):
            ps = sess.run(params)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)

        self.train = train
        self.save = save
        self.load = load
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        print("global_variables_initializer start")
        tf.global_variables_initializer().run(session=sess)
        print("global_variables_initializer complete")
Example #4
    def __init__(self, policy, ob_space, ac_space, nenvs,
                 expert_nbatch,
                 total_timesteps,
                 nprocs=32, nsteps=20,
                 ent_coef=0.01,
                 vf_coef=0.5, vf_fisher_coef=1.0, vf_expert_coef=0.5 * 0.0,
                 expert_coeff=1.0,
                 exp_adv_est='reward',
                 lr=0.25, max_grad_norm=0.5,
                 kfac_clip=0.001, lrschedule='linear'):

        # create tf stuff
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=nprocs,
                                inter_op_parallelism_threads=nprocs)
        config.gpu_options.allow_growth = True
        self.sess = sess = tf.Session(config=config)

        # the actual model
        nact = ac_space.n
        nbatch = nenvs * nsteps
        A = tf.placeholder(tf.int32, [nbatch])
        A_EXP = tf.placeholder(tf.int32, [expert_nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        ADV_EXP = tf.placeholder(tf.float32, [expert_nbatch])

        R = tf.placeholder(tf.float32, [nbatch])
        R_EXP = tf.placeholder(tf.float32, [expert_nbatch])

        PG_LR = tf.placeholder(tf.float32, [])

        step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
        eval_step_model = policy(sess, ob_space, ac_space, 1, 1, reuse=True)
        train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True)
        expert_train_model = policy(sess, ob_space, ac_space, expert_nbatch, 1, reuse=True)
        logpac_expert = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=expert_train_model.pi, labels=A_EXP)
        logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)

        _, acc = tf.metrics.accuracy(labels=A,
                                     predictions=tf.argmax(train_model.pi, 1))

        ## training loss
        pg_loss = tf.reduce_mean(ADV*logpac)
        pg_expert_loss = tf.reduce_mean(ADV_EXP * logpac_expert)
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        pg_loss = pg_loss - ent_coef * entropy
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        vf_expert_loss = tf.reduce_mean(mse(tf.squeeze(expert_train_model.vf), R_EXP))
        train_loss = pg_loss + vf_coef * vf_loss + expert_coeff * pg_expert_loss + vf_expert_coef * vf_expert_loss

        self.check = check = tf.add_check_numerics_ops()

        ## Fisher loss construction
        pg_fisher_loss = -tf.reduce_mean(logpac)  # + logpac_expert)
        # pg_expert_fisher_loss = -tf.reduce_mean(logpac_expert)
        sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf))
        vf_fisher_loss = - vf_fisher_coef * tf.reduce_mean(tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
        joint_fisher_loss = pg_fisher_loss + vf_fisher_loss

        params = find_trainable_variables("model")

        self.grads_check = grads = tf.gradients(train_loss, params)

        with tf.device('/gpu:0'):
            self.optim = optim = kfac.KfacOptimizer(
                learning_rate=PG_LR, clip_kl=kfac_clip,
                momentum=0.9, kfac_update=1, epsilon=0.01,
                stats_decay=0.99, async=1, cold_iter=20, max_grad_norm=max_grad_norm
            )

            # why is this unused?
            update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
            train_op, q_runner = optim.apply_gradients(list(zip(grads,params)))
        self.q_runner = q_runner
        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values,
                  expert_obs, expert_rewards, expert_actions, expert_values):
            if exp_adv_est == 'critic':
                expert_advs = np.clip(expert_rewards - expert_values, a_min=0, a_max=None)
            elif exp_adv_est == 'reward':
                expert_advs = expert_rewards
            elif exp_adv_est == 'simple':
                expert_advs = np.ones_like(expert_rewards)
            else:
                raise ValueError("Unknown expert advantage estimator {}".format(exp_adv_est))

            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()

            td_map = {
                train_model.X:obs,
                expert_train_model.X: expert_obs,
                A_EXP: expert_actions,
                A:actions,
                ADV:advs,
                ADV_EXP: expert_advs,
                R:rewards,
                PG_LR:cur_lr,
                R_EXP: expert_rewards
            }

            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            policy_loss, policy_expert_loss, value_loss, policy_entropy, train_accuracy, _, grads_to_check = sess.run(
                [pg_loss, pg_expert_loss, vf_loss, entropy, acc, train_op, grads],
                td_map
            )

            for grad in grads_to_check:
                if np.isnan(grad).any():
                    print("ojojoj grad is nan")

            return policy_loss, policy_expert_loss, value_loss, policy_entropy, train_accuracy

        def save(save_path):
            print("Writing model to {}".format(save_path))
            ps = sess.run(params)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)

        def eval_step(obs, eval_type):
            td_map = {eval_step_model.X: [obs]}
            logits = sess.run(eval_step_model.pi, td_map)[0]
            if eval_type == 'argmax':
                act = logits.argmax()
                if np.random.rand() < 0.01:
                    act = ac_space.sample()
                return act
            elif eval_type == 'prob':
                # probs = func(s[None, :, :, :])[0][0]
                x = logits
                e_x = np.exp(x - np.max(x))
                probs = e_x / e_x.sum(axis=0)
                act = np.random.choice(range(probs.shape[-1]), 1, p=probs)[0]
                return act
            else:
                raise ValueError("Unknown eval type {}".format(eval_type))

        self.model = step_model
        self.model2 = train_model
        self.expert_train_model = expert_train_model
        self.vf_fisher = vf_fisher_loss
        self.pg_fisher = pg_fisher_loss
        self.joint_fisher = joint_fisher_loss
        self.params = params
        self.train = train
        self.save = save
        self.load = load
        self.train_model = train_model
        self.step_model = step_model
        self.eval_step = eval_step
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        tf.global_variables_initializer().run(session=sess)
        tf.local_variables_initializer().run(session=sess)
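
The `'prob'` branch of `eval_step` above is plain softmax sampling over the logits (the `'argmax'` branch additionally takes a random action 1% of the time). A standalone numpy equivalent, for reference only:

import numpy as np

def sample_from_logits(logits, rng=np.random):
    # Numerically stable softmax followed by categorical sampling,
    # mirroring the 'prob' branch of eval_step.
    e_x = np.exp(logits - np.max(logits))
    probs = e_x / e_x.sum(axis=0)
    return rng.choice(len(probs), p=probs)

print(sample_from_logits(np.array([2.0, 1.0, 0.1])))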
Example #5
  def __init__(self, policy, ob_space, ac_space,
               nenvs,total_timesteps, nprocs=32, nsteps=20,
               nstack=4, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0,
               lr=0.25, max_grad_norm=0.5,
               kfac_clip=0.001, lrschedule='linear'):
    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=nprocs,
                            inter_op_parallelism_threads=nprocs)
    config.gpu_options.allow_growth = True
    self.sess = sess = tf.Session(config=config)
    #nact = ac_space.n
    nbatch = nenvs * nsteps
    A = tf.placeholder(tf.int32, [nbatch])

    SUB3 = tf.placeholder(tf.int32, [nbatch])
    SUB4 = tf.placeholder(tf.int32, [nbatch])
    SUB5 = tf.placeholder(tf.int32, [nbatch])
    SUB6 = tf.placeholder(tf.int32, [nbatch])
    SUB7 = tf.placeholder(tf.int32, [nbatch])
    SUB8 = tf.placeholder(tf.int32, [nbatch])
    SUB9 = tf.placeholder(tf.int32, [nbatch])
    SUB10 = tf.placeholder(tf.int32, [nbatch])
    SUB11 = tf.placeholder(tf.int32, [nbatch])
    SUB12 = tf.placeholder(tf.int32, [nbatch])

    X0 = tf.placeholder(tf.int32, [nbatch])
    Y0 = tf.placeholder(tf.int32, [nbatch])
    X1 = tf.placeholder(tf.int32, [nbatch])
    Y1 = tf.placeholder(tf.int32, [nbatch])
    X2 = tf.placeholder(tf.int32, [nbatch])
    Y2 = tf.placeholder(tf.int32, [nbatch])

    ADV = tf.placeholder(tf.float32, [nbatch])
    R = tf.placeholder(tf.float32, [nbatch])
    PG_LR = tf.placeholder(tf.float32, [])
    VF_LR = tf.placeholder(tf.float32, [])

    self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
    self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True)

    # Policy 1 : Base Action : train_model.pi label = A

    logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)

    logpac_sub3 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub3, labels=SUB3)
    logpac_sub4 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub4, labels=SUB4)
    logpac_sub5 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub5, labels=SUB5)
    logpac_sub6 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub6, labels=SUB6)
    logpac_sub7 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub7, labels=SUB7)
    logpac_sub8 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub8, labels=SUB8)
    logpac_sub9 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub9, labels=SUB9)
    logpac_sub10 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub10, labels=SUB10)
    logpac_sub11 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub11, labels=SUB11)
    logpac_sub12 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub12, labels=SUB12)

    logpac_x0 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_x0, labels=X0)
    logpac_y0 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_y0, labels=Y0)
    logpac_x1 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_x1, labels=X1)
    logpac_y1 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_y1, labels=Y1)
    logpac_x2 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_x2, labels=X2)
    logpac_y2 = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_y2, labels=Y2)

    self.logits = logits = train_model.pi

    ##training loss
    pg_loss = tf.reduce_mean(ADV*logpac) * tf.reduce_mean(ADV)

    pg_loss_sub3 = tf.reduce_mean(ADV*logpac_sub3) * tf.reduce_mean(ADV)
    pg_loss_sub4 = tf.reduce_mean(ADV*logpac_sub4) * tf.reduce_mean(ADV)
    pg_loss_sub5 = tf.reduce_mean(ADV*logpac_sub5) * tf.reduce_mean(ADV)
    pg_loss_sub6 = tf.reduce_mean(ADV*logpac_sub6) * tf.reduce_mean(ADV)
    pg_loss_sub7 = tf.reduce_mean(ADV*logpac_sub7) * tf.reduce_mean(ADV)
    pg_loss_sub8 = tf.reduce_mean(ADV*logpac_sub8) * tf.reduce_mean(ADV)
    pg_loss_sub9 = tf.reduce_mean(ADV*logpac_sub9) * tf.reduce_mean(ADV)
    pg_loss_sub10 = tf.reduce_mean(ADV*logpac_sub10) * tf.reduce_mean(ADV)
    pg_loss_sub11 = tf.reduce_mean(ADV*logpac_sub11) * tf.reduce_mean(ADV)
    pg_loss_sub12 = tf.reduce_mean(ADV*logpac_sub12) * tf.reduce_mean(ADV)

    pg_loss_x0 = tf.reduce_mean(ADV*logpac_x0) * tf.reduce_mean(ADV)
    pg_loss_y0 = tf.reduce_mean(ADV*logpac_y0) * tf.reduce_mean(ADV)
    pg_loss_x1 = tf.reduce_mean(ADV*logpac_x1) * tf.reduce_mean(ADV)
    pg_loss_y1 = tf.reduce_mean(ADV*logpac_y1) * tf.reduce_mean(ADV)
    pg_loss_x2 = tf.reduce_mean(ADV*logpac_x2) * tf.reduce_mean(ADV)
    pg_loss_y2 = tf.reduce_mean(ADV*logpac_y2) * tf.reduce_mean(ADV)

    entropy = tf.reduce_mean(cat_entropy(train_model.pi))

    entropy_sub3 = tf.reduce_mean(cat_entropy(train_model.pi_sub3))
    entropy_sub4 = tf.reduce_mean(cat_entropy(train_model.pi_sub4))
    entropy_sub5 = tf.reduce_mean(cat_entropy(train_model.pi_sub5))
    entropy_sub6 = tf.reduce_mean(cat_entropy(train_model.pi_sub6))
    entropy_sub7 = tf.reduce_mean(cat_entropy(train_model.pi_sub7))
    entropy_sub8 = tf.reduce_mean(cat_entropy(train_model.pi_sub8))
    entropy_sub9 = tf.reduce_mean(cat_entropy(train_model.pi_sub9))
    entropy_sub10 = tf.reduce_mean(cat_entropy(train_model.pi_sub10))
    entropy_sub11 = tf.reduce_mean(cat_entropy(train_model.pi_sub11))
    entropy_sub12 = tf.reduce_mean(cat_entropy(train_model.pi_sub12))

    entropy_x0 = tf.reduce_mean(cat_entropy(train_model.pi_x0))
    entropy_y0 = tf.reduce_mean(cat_entropy(train_model.pi_y0))
    entropy_x1 = tf.reduce_mean(cat_entropy(train_model.pi_x1))
    entropy_y1 = tf.reduce_mean(cat_entropy(train_model.pi_y1))
    entropy_x2 = tf.reduce_mean(cat_entropy(train_model.pi_x2))
    entropy_y2 = tf.reduce_mean(cat_entropy(train_model.pi_y2))

    pg_loss = pg_loss - ent_coef * entropy

    pg_loss_sub3 = pg_loss_sub3 - ent_coef * entropy_sub3
    pg_loss_sub4 = pg_loss_sub4 - ent_coef * entropy_sub4
    pg_loss_sub5 = pg_loss_sub5 - ent_coef * entropy_sub5
    pg_loss_sub6 = pg_loss_sub6 - ent_coef * entropy_sub6
    pg_loss_sub7 = pg_loss_sub7 - ent_coef * entropy_sub7
    pg_loss_sub8 = pg_loss_sub8 - ent_coef * entropy_sub8
    pg_loss_sub9 = pg_loss_sub9 - ent_coef * entropy_sub9
    pg_loss_sub10 = pg_loss_sub10 - ent_coef * entropy_sub10
    pg_loss_sub11 = pg_loss_sub11 - ent_coef * entropy_sub11
    pg_loss_sub12 = pg_loss_sub12 - ent_coef * entropy_sub12

    pg_loss_x0 = pg_loss_x0 - ent_coef * entropy_x0
    pg_loss_y0 = pg_loss_y0 - ent_coef * entropy_y0
    pg_loss_x1 = pg_loss_x1 - ent_coef * entropy_x1
    pg_loss_y1 = pg_loss_y1 - ent_coef * entropy_y1
    pg_loss_x2 = pg_loss_x2 - ent_coef * entropy_x2
    pg_loss_y2 = pg_loss_y2 - ent_coef * entropy_y2

    vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))

    self.params = params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='model')

    self.params_common = params_common = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/common')

    self.params_pi1 = params_pi1 = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/pi1') + params_common

    # Base Action

    train_loss = pg_loss + vf_coef * vf_loss

    ##Fisher loss construction
    self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac)
    sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf))
    self.vf_fisher = vf_fisher_loss = - vf_fisher_coef*tf.reduce_mean(tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
    self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss

    print("train_loss :", train_loss, " pg_fisher :", pg_fisher_loss,
          " vf_fisher :", vf_fisher_loss, " joint_fisher_loss :", joint_fisher_loss)

    self.grads_check = grads = tf.gradients(train_loss, params_pi1)

    with tf.device('/gpu:0'):
      self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,
                                              momentum=0.9, kfac_update=1, epsilon=0.01,
                                              stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)

      update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params_pi1)
      train_op, q_runner = optim.apply_gradients(list(zip(grads, params_pi1)))

    self.q_runner = q_runner

    # sub3

    self.params_sub3 = params_sub3 = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/sub3')

    train_loss_sub3 = pg_loss_sub3 + vf_coef * vf_loss

    ##Fisher loss construction
    self.pg_fisher_sub3 = pg_fisher_loss_sub3 = -tf.reduce_mean(logpac_sub3)
    self.joint_fisher_sub3 = joint_fisher_loss_sub3 = pg_fisher_loss_sub3 + vf_fisher_loss

    self.grads_check_sub3 = grads_sub3 = tf.gradients(train_loss_sub3, params_sub3)

    with tf.device('/gpu:0'):
      self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,
                                              momentum=0.9, kfac_update=1, epsilon=0.01,
                                              stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)

      update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss_sub3, var_list=params_sub3)
      train_op_sub3, q_runner_sub3 = optim.apply_gradients(list(zip(grads_sub3, params_sub3)))

    self.q_runner_sub3 = q_runner_sub3

    # sub4

    self.params_sub4 = params_sub4 = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/sub4')

    train_loss_sub4 = pg_loss_sub4 + vf_coef * vf_loss

    ##Fisher loss construction
    self.pg_fisher_sub4 = pg_fisher_loss_sub4 = -tf.reduce_mean(logpac_sub4)
    self.joint_fisher_sub4 = joint_fisher_loss_sub4 = pg_fisher_loss_sub4 + vf_fisher_loss


    self.grads_check_sub4 = grads_sub4 = tf.gradients(train_loss_sub4, params_sub4)

    with tf.device('/gpu:0'):
      self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,
                                              momentum=0.9, kfac_update=1, epsilon=0.01,
                                              stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)

      update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss_sub4, var_list=params_sub4)
      train_op_sub4, q_runner_sub4 = optim.apply_gradients(list(zip(grads_sub4, params_sub4)))

    self.q_runner_sub4 = q_runner_sub4


    # sub5

    self.params_sub5 = params_sub5 = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/sub5')

    train_loss_sub5 = pg_loss_sub5 + vf_coef * vf_loss

    ##Fisher loss construction
    self.pg_fisher_sub5 = pg_fisher_loss_sub5 = -tf.reduce_mean(logpac_sub5)
    self.joint_fisher_sub5 = joint_fisher_loss_sub5 = pg_fisher_loss_sub5 + vf_fisher_loss


    self.grads_check_sub5 = grads_sub5 = tf.gradients(train_loss_sub5, params_sub5)

    with tf.device('/gpu:0'):
      self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR,
                                              clip_kl=kfac_clip,
                                              momentum=0.9,
                                              kfac_update=1,
                                              epsilon=0.01,
                                              stats_decay=0.99,
                                              async=1,
                                              cold_iter=10,
                                              max_grad_norm=max_grad_norm)

      update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss_sub5, var_list=params_sub5)
      train_op_sub5, q_runner_sub5 = optim.apply_gradients(list(zip(grads_sub5, params_sub5)))

    self.q_runner_sub5 = q_runner_sub5

    # sub6

    self.params_sub6 = params_sub6 = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/sub6')

    train_loss_sub6 = pg_loss_sub6 + vf_coef * vf_loss

    ##Fisher loss construction
    self.pg_fisher_sub6 = pg_fisher_loss_sub6 = -tf.reduce_mean(logpac_sub6)
    self.joint_fisher_sub6 = joint_fisher_loss_sub6 = pg_fisher_loss_sub6 + vf_fisher_loss


    self.grads_check_sub6 = grads_sub6 = tf.gradients(train_loss_sub6, params_sub6)

    with tf.device('/gpu:0'):
      self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,
                                              momentum=0.9, kfac_update=1, epsilon=0.01,
                                              stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)

      update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss_sub6, var_list=params_sub6)
      train_op_sub6, q_runner_sub6 = optim.apply_gradients(list(zip(grads_sub6, params_sub6)))

    self.q_runner_sub6 = q_runner_sub6


    # sub7

    self.params_sub7 = params_sub7 = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/sub7')

    train_loss_sub7 = pg_loss_sub7 + vf_coef * vf_loss

    ##Fisher loss construction
    self.pg_fisher_sub7 = pg_fisher_loss_sub7 = -tf.reduce_mean(logpac_sub7)
    self.joint_fisher_sub7 = joint_fisher_loss_sub7 = pg_fisher_loss_sub7 + vf_fisher_loss


    self.grads_check_sub7 = grads_sub7 = tf.gradients(train_loss_sub7, params_sub7)

    with tf.device('/gpu:0'):
      self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,
                                              momentum=0.9, kfac_update=1, epsilon=0.01,
                                              stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)

      update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss_sub7, var_list=params_sub7)
      train_op_sub7, q_runner_sub7 = optim.apply_gradients(list(zip(grads_sub7, params_sub7)))

    self.q_runner_sub7 = q_runner_sub7


    # sub8

    self.params_sub8 = params_sub8 = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/sub8')

    train_loss_sub8 = pg_loss_sub8 + vf_coef * vf_loss

    ##Fisher loss construction
    self.pg_fisher_sub8 = pg_fisher_loss_sub8 = -tf.reduce_mean(logpac_sub8)
    self.joint_fisher_sub8 = joint_fisher_loss_sub8 = pg_fisher_loss_sub8 + vf_fisher_loss


    self.grads_check_sub8 = grads_sub8 = tf.gradients(train_loss_sub8, params_sub8)

    with tf.device('/gpu:0'):
      self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,
                                              momentum=0.9, kfac_update=1, epsilon=0.01,
                                              stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)

      update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss_sub8, var_list=params_sub8)
      train_op_sub8, q_runner_sub8 = optim.apply_gradients(list(zip(grads_sub8, params_sub8)))

    self.q_runner_sub8 = q_runner_sub8



    # sub9

    self.params_sub9 = params_sub9 = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/sub9')

    train_loss_sub9 = pg_loss_sub9 + vf_coef * vf_loss

    ##Fisher loss construction
    self.pg_fisher_sub9 = pg_fisher_loss_sub9 = -tf.reduce_mean(logpac_sub9)
    self.joint_fisher_sub9 = joint_fisher_loss_sub9 = pg_fisher_loss_sub9 + vf_fisher_loss


    self.grads_check_sub9 = grads_sub9 = tf.gradients(train_loss_sub9, params_sub9)

    with tf.device('/gpu:0'):
      self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,
                                              momentum=0.9, kfac_update=1, epsilon=0.01,
                                              stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)

      update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss_sub9, var_list=params_sub9)
      train_op_sub9, q_runner_sub9 = optim.apply_gradients(list(zip(grads_sub9, params_sub9)))

    self.q_runner_sub9 = q_runner_sub9


    # sub10

    self.params_sub10 = params_sub10 = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/sub10')

    train_loss_sub10 = pg_loss_sub10 + vf_coef * vf_loss

    ##Fisher loss construction
    self.pg_fisher_sub10 = pg_fisher_loss_sub10 = -tf.reduce_mean(logpac_sub10)
    self.joint_fisher_sub10 = joint_fisher_loss_sub10 = pg_fisher_loss_sub10 + vf_fisher_loss


    self.grads_check_sub10 = grads_sub10 = tf.gradients(train_loss_sub10, params_sub10)

    with tf.device('/gpu:0'):
      self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,
                                              momentum=0.9, kfac_update=1, epsilon=0.01,
                                              stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)

      update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss_sub10, var_list=params_sub10)
      train_op_sub10, q_runner_sub10 = optim.apply_gradients(list(zip(grads_sub10, params_sub10)))

    self.q_runner_sub10 = q_runner_sub10


    # sub11

    self.params_sub11 = params_sub11 = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/sub11')

    train_loss_sub11 = pg_loss_sub11 + vf_coef * vf_loss

    ##Fisher loss construction
    self.pg_fisher_sub11 = pg_fisher_loss_sub11 = -tf.reduce_mean(logpac_sub11)
    self.joint_fisher_sub11 = joint_fisher_loss_sub11 = pg_fisher_loss_sub11 + vf_fisher_loss


    self.grads_check_sub11 = grads_sub11 = tf.gradients(train_loss_sub11, params_sub11)

    with tf.device('/gpu:0'):
      self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,
                                              momentum=0.9, kfac_update=1, epsilon=0.01,
                                              stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)

      update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss_sub11, var_list=params_sub11)
      train_op_sub11, q_runner_sub11 = optim.apply_gradients(list(zip(grads_sub11, params_sub11)))

    self.q_runner_sub11 = q_runner_sub11


    # sub12

    self.params_sub12 = params_sub12 = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/sub12')

    train_loss_sub12 = pg_loss_sub12 + vf_coef * vf_loss

    ##Fisher loss construction
    self.pg_fisher_sub12 = pg_fisher_loss_sub12 = -tf.reduce_mean(logpac_sub12)
    self.joint_fisher_sub12 = joint_fisher_loss_sub12 = pg_fisher_loss_sub12 + vf_fisher_loss


    self.grads_check_sub12 = grads_sub12 = tf.gradients(train_loss_sub12, params_sub12)

    with tf.device('/gpu:0'):
      self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,
                                              momentum=0.9, kfac_update=1, epsilon=0.01,
                                              stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)

      update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss_sub12, var_list=params_sub12)
      train_op_sub12, q_runner_sub12 = optim.apply_gradients(list(zip(grads_sub12, params_sub12)))

    self.q_runner_sub12 = q_runner_sub12


    # x0

    self.params_xy0 = params_xy0 = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/xy0') + params_common

    train_loss_x0 = pg_loss_x0 + vf_coef * vf_loss

    ##Fisher loss construction
    self.pg_fisher_x0 = pg_fisher_loss_x0 = -tf.reduce_mean(logpac_x0)
    self.joint_fisher_x0 = joint_fisher_loss_x0 = pg_fisher_loss_x0 + vf_fisher_loss


    self.grads_check_x0 = grads_x0 = tf.gradients(train_loss_x0, params_xy0)

    with tf.device('/gpu:0'):
      self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,
                                              momentum=0.9, kfac_update=1, epsilon=0.01,
                                              stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)

      update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss_x0, var_list=params_xy0)
      train_op_x0, q_runner_x0 = optim.apply_gradients(list(zip(grads_x0, params_xy0)))

    self.q_runner_x0 = q_runner_x0


    # y0

    train_loss_y0 = pg_loss_y0 + vf_coef * vf_loss

    ##Fisher loss construction
    self.pg_fisher_y0 = pg_fisher_loss_y0 = -tf.reduce_mean(logpac_y0)
    self.joint_fisher_y0 = joint_fisher_loss_y0 = pg_fisher_loss_y0 + vf_fisher_loss


    self.grads_check_y0 = grads_y0 = tf.gradients(train_loss_y0, params_xy0)

    with tf.device('/gpu:0'):
      self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,
                                              momentum=0.9, kfac_update=1, epsilon=0.01,
                                              stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)

      update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss_y0, var_list=params_xy0)
      train_op_y0, q_runner_y0 = optim.apply_gradients(list(zip(grads_y0, params_xy0)))

    self.q_runner_y0 = q_runner_y0


    # x1

    self.params_xy1 = params_xy1 = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/xy1') + params_common

    train_loss_x1 = pg_loss_x1 + vf_coef * vf_loss

    ##Fisher loss construction
    self.pg_fisher_x1 = pg_fisher_loss_x1 = -tf.reduce_mean(logpac_x1)
    self.joint_fisher_x1 = joint_fisher_loss_x1 = pg_fisher_loss_x1 + vf_fisher_loss


    self.grads_check_x1 = grads_x1 = tf.gradients(train_loss_x1, params_xy1)

    with tf.device('/gpu:0'):
      self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,
                                              momentum=0.9, kfac_update=1, epsilon=0.01,
                                              stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)

      update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss_x1, var_list=params_xy1)
      train_op_x1, q_runner_x1 = optim.apply_gradients(list(zip(grads_x1, params_xy1)))

    self.q_runner_x1 = q_runner_x1


    # y1

    train_loss_y1 = pg_loss_y1 + vf_coef * vf_loss

    ##Fisher loss construction
    self.pg_fisher_y1 = pg_fisher_loss_y1 = -tf.reduce_mean(logpac_y1)
    self.joint_fisher_y1 = joint_fisher_loss_y1 = pg_fisher_loss_y1 + vf_fisher_loss


    self.grads_check_y1 = grads_y1 = tf.gradients(train_loss_y1, params_xy1)

    with tf.device('/gpu:0'):
      self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,
                                              momentum=0.9, kfac_update=1, epsilon=0.01,
                                              stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)

      update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss_y1, var_list=params_xy1)
      train_op_y1, q_runner_y1 = optim.apply_gradients(list(zip(grads_y1, params_xy1)))

    self.q_runner_y1 = q_runner_y1



    # x2

    self.params_xy2 = params_xy2 = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/xy2') + params_common

    train_loss_x2 = pg_loss_x2 + vf_coef * vf_loss

    ##Fisher loss construction
    self.pg_fisher_x2 = pg_fisher_loss_x2 = -tf.reduce_mean(logpac_x2)
    self.joint_fisher_x2 = joint_fisher_loss_x2 = pg_fisher_loss_x2 + vf_fisher_loss


    self.grads_check_x2 = grads_x2 = tf.gradients(train_loss_x2, params_xy2)

    with tf.device('/gpu:0'):
      self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,
                                              momentum=0.9, kfac_update=1, epsilon=0.01,
                                              stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)

      update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss_x2, var_list=params_xy2)
      train_op_x2, q_runner_x2 = optim.apply_gradients(list(zip(grads_x2, params_xy2)))

    self.q_runner_x2 = q_runner_x2


    # y2

    train_loss_y2 = pg_loss_y2 + vf_coef * vf_loss

    ##Fisher loss construction
    self.pg_fisher_y2 = pg_fisher_loss_y2 = -tf.reduce_mean(logpac_y2)
    self.joint_fisher_y2 = joint_fisher_loss_y2 = pg_fisher_loss_y2 + vf_fisher_loss


    self.grads_check_y2 = grads_y2 = tf.gradients(train_loss_y2, params_xy2)

    with tf.device('/gpu:0'):
      self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,
                                              momentum=0.9, kfac_update=1, epsilon=0.01,
                                              stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)

      update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss_y2, var_list=params_xy2)
      train_op_y2, q_runner_y2 = optim.apply_gradients(list(zip(grads_y2, params_xy2)))

    self.q_runner_y2 = q_runner_y2



    self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

    def train(obs, states, rewards, masks, actions,
              sub3, sub4, sub5, sub6, sub7, sub8, sub9, sub10, sub11, sub12,
              x0, y0, x1, y1, x2, y2, values):
      advs = rewards - values
      for step in range(len(obs)):
        cur_lr = self.lr.value()

      td_map = {train_model.X:obs, A:actions,
                SUB3:sub3, SUB4:sub4, SUB5:sub5, SUB6:sub6, SUB7:sub7,
                SUB8:sub8, SUB9:sub9, SUB10:sub10, SUB11:sub11, SUB12:sub12,
                X0:x0, Y0:y0, X1:x1, Y1:y1, X2:x2, Y2:y2, ADV:advs, R:rewards, PG_LR:cur_lr}
      if states != []:
        td_map[train_model.S] = states
        td_map[train_model.M] = masks

      policy_loss, value_loss, policy_entropy, _, \
      policy_loss_sub3, policy_entropy_sub3, _, \
      policy_loss_sub4, policy_entropy_sub4, _, \
      policy_loss_sub5, policy_entropy_sub5, _, \
      policy_loss_sub6, policy_entropy_sub6, _, \
      policy_loss_sub7, policy_entropy_sub7, _, \
      policy_loss_sub8, policy_entropy_sub8, _, \
      policy_loss_sub9, policy_entropy_sub9, _, \
      policy_loss_sub10, policy_entropy_sub10, _, \
      policy_loss_sub11, policy_entropy_sub11, _, \
      policy_loss_sub12, policy_entropy_sub12, _, \
      policy_loss_x0, policy_entropy_x0, _, \
      policy_loss_y0, policy_entropy_y0, _ , \
      policy_loss_x1, policy_entropy_x1, _ , \
      policy_loss_y1, policy_entropy_y1, _ , \
      policy_loss_x2, policy_entropy_x2, _ , \
      policy_loss_y2, policy_entropy_y2, _  = sess.run(
        [pg_loss, vf_loss, entropy, train_op,
         pg_loss_sub3, entropy_sub3, train_op_sub3,
         pg_loss_sub4, entropy_sub4, train_op_sub4,
         pg_loss_sub5, entropy_sub5, train_op_sub5,
         pg_loss_sub6, entropy_sub6, train_op_sub6,
         pg_loss_sub7, entropy_sub7, train_op_sub7,
         pg_loss_sub8, entropy_sub8, train_op_sub8,
         pg_loss_sub9, entropy_sub9, train_op_sub9,
         pg_loss_sub10, entropy_sub10, train_op_sub10,
         pg_loss_sub11, entropy_sub11, train_op_sub11,
         pg_loss_sub12, entropy_sub12, train_op_sub12,
         pg_loss_x0, entropy_x0, train_op_x0,
         pg_loss_y0, entropy_y0, train_op_y0,
         pg_loss_x1, entropy_x1, train_op_x1,
         pg_loss_y1, entropy_y1, train_op_y1,
         pg_loss_x2, entropy_x2, train_op_x2,
         pg_loss_y2, entropy_y2, train_op_y2],
        td_map
      )
      print("policy_loss : ", policy_loss, " value_loss : ", value_loss, " entropy : ", entropy)

      # policy_loss = 1 if(np.isinf(policy_loss)) else policy_loss
      # value_loss = 1 if(np.isinf(value_loss)) else value_loss
      # policy_entropy = 1 if(np.isinf(policy_entropy)) else policy_entropy
      #
      # policy_loss_sub3 = 1 if(np.isinf(policy_loss_sub3)) else policy_loss_sub3
      # value_loss = 1 if(np.isinf(value_loss)) else value_loss
      # policy_entropy = 1 if(np.isinf(policy_entropy)) else policy_entropy

      return policy_loss, value_loss, policy_entropy

    def save(save_path):
      ps = sess.run(params)
      joblib.dump(ps, save_path)

    def load(load_path):
      loaded_params = joblib.load(load_path)
      restores = []
      for p, loaded_p in zip(params, loaded_params):
        restores.append(p.assign(loaded_p))
      sess.run(restores)

    self.train = train
    self.save = save
    self.load = load
    self.train_model = train_model
    self.step_model = step_model
    self.step = step_model.step
    self.value = step_model.value
    self.initial_state = step_model.initial_state
    print("global_variables_initializer start")
    tf.global_variables_initializer().run(session=sess)
    print("global_variables_initializer complete")
Example #6
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 total_timesteps,
                 nprocs=32,
                 nsteps=20,
                 nstack=4,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 vf_fisher_coef=1.0,
                 lr=0.25,
                 max_grad_norm=0.5,
                 kfac_clip=0.001,
                 lrschedule='linear'):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=nprocs,
                                inter_op_parallelism_threads=nprocs)
        config.gpu_options.allow_growth = True
        self.sess = sess = tf.Session(config=config)
        #nact = ac_space.n
        nbatch = nenvs * nsteps
        A = tf.placeholder(tf.int32, [nbatch])

        SUB3 = tf.placeholder(tf.int32, [nbatch])
        SUB4 = tf.placeholder(tf.int32, [nbatch])
        SUB5 = tf.placeholder(tf.int32, [nbatch])
        SUB6 = tf.placeholder(tf.int32, [nbatch])
        SUB7 = tf.placeholder(tf.int32, [nbatch])
        SUB8 = tf.placeholder(tf.int32, [nbatch])
        SUB9 = tf.placeholder(tf.int32, [nbatch])
        SUB10 = tf.placeholder(tf.int32, [nbatch])
        SUB11 = tf.placeholder(tf.int32, [nbatch])
        SUB12 = tf.placeholder(tf.int32, [nbatch])

        X0 = tf.placeholder(tf.int32, [nbatch])
        Y0 = tf.placeholder(tf.int32, [nbatch])
        X1 = tf.placeholder(tf.int32, [nbatch])
        Y1 = tf.placeholder(tf.int32, [nbatch])
        X2 = tf.placeholder(tf.int32, [nbatch])
        Y2 = tf.placeholder(tf.int32, [nbatch])

        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        PG_LR = tf.placeholder(tf.float32, [])
        VF_LR = tf.placeholder(tf.float32, [])

        self.model = step_model = policy(sess,
                                         ob_space,
                                         ac_space,
                                         nenvs,
                                         1,
                                         nstack,
                                         reuse=False)
        self.model2 = train_model = policy(sess,
                                           ob_space,
                                           ac_space,
                                           nenvs,
                                           nsteps,
                                           nstack,
                                           reuse=True)



        logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A) \
                 + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub3, labels=SUB3) \
                 + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub4, labels=SUB4) \
                 + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub5, labels=SUB5) \
                 + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub6, labels=SUB6) \
                 + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub7, labels=SUB7) \
                 + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub8, labels=SUB8) \
                 + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub9, labels=SUB9) \
                 + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub10, labels=SUB10) \
                 + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub11, labels=SUB11) \
                 + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_sub12, labels=SUB12) \
                 + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_x0, labels=X0) \
                 + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_y0, labels=Y0) \
                 + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_x1, labels=X1) \
                 + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_y1, labels=Y1) \
                 + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_x2, labels=X2) \
                 + tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi_y2, labels=Y2)
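        # Unlike Example #5, which gives every action head its own K-FAC
        # optimizer, here the per-head cross entropies are summed into a single
        # logpac so that one loss and one optimizer cover all heads.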

        self.logits = logits = train_model.pi

        ##training loss
        pg_loss = tf.reduce_mean(ADV * logpac) * tf.reduce_mean(ADV)
        entropy = tf.reduce_mean(cat_entropy(train_model.pi)) \
                  + tf.reduce_mean(cat_entropy(train_model.pi_sub3)) \
                  + tf.reduce_mean(cat_entropy(train_model.pi_sub4)) \
                  + tf.reduce_mean(cat_entropy(train_model.pi_sub5)) \
                  + tf.reduce_mean(cat_entropy(train_model.pi_sub6)) \
                  + tf.reduce_mean(cat_entropy(train_model.pi_sub7)) \
                  + tf.reduce_mean(cat_entropy(train_model.pi_sub8)) \
                  + tf.reduce_mean(cat_entropy(train_model.pi_sub9)) \
                  + tf.reduce_mean(cat_entropy(train_model.pi_sub10)) \
                  + tf.reduce_mean(cat_entropy(train_model.pi_sub11)) \
                  + tf.reduce_mean(cat_entropy(train_model.pi_sub12)) \
                  + tf.reduce_mean(cat_entropy(train_model.pi_x0)) \
                  + tf.reduce_mean(cat_entropy(train_model.pi_y0)) \
                  + tf.reduce_mean(cat_entropy(train_model.pi_x1)) \
                  + tf.reduce_mean(cat_entropy(train_model.pi_y1)) \
                  + tf.reduce_mean(cat_entropy(train_model.pi_x2)) \
                  + tf.reduce_mean(cat_entropy(train_model.pi_y2))

        pg_loss = pg_loss - ent_coef * entropy
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        train_loss = pg_loss + vf_coef * vf_loss

        ##Fisher loss construction
        self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac)
        sample_net = train_model.vf + tf.random_normal(tf.shape(
            train_model.vf))
        self.vf_fisher = vf_fisher_loss = -vf_fisher_coef * tf.reduce_mean(
            tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
        self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss

        self.params = params = find_trainable_variables("model")

        self.grads_check = grads = tf.gradients(train_loss, params)

        with tf.device('/gpu:0'):
            self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip, \
                                                    momentum=0.9, kfac_update=1, epsilon=0.01, \
                                                    stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)

            update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss,
                                                            var_list=params)
            train_op, q_runner = optim.apply_gradients(list(zip(grads,
                                                                params)))
        self.q_runner = q_runner
        self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, sub3, sub4, sub5, sub6,
                  sub7, sub8, sub9, sub10, sub11, sub12, x0, y0, x1, y1, x2,
                  y2, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = self.lr.value()

            td_map = {
                train_model.X: obs,
                A: actions,
                SUB3: sub3,
                SUB4: sub4,
                SUB5: sub5,
                SUB6: sub6,
                SUB7: sub7,
                SUB8: sub8,
                SUB9: sub9,
                SUB10: sub10,
                SUB11: sub11,
                SUB12: sub12,
                X0: x0,
                Y0: y0,
                X1: x1,
                Y1: y1,
                X2: x2,
                Y2: y2,
                ADV: advs,
                R: rewards,
                PG_LR: cur_lr
            }
            if states != []:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, train_op], td_map)
            print("policy_loss : ", policy_loss, " value_loss : ", value_loss,
                  " entropy : ", entropy)

            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)

        self.train = train
        self.save = save
        self.load = load
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        print("global_variables_initializer start")
        tf.global_variables_initializer().run(session=sess)
        print("global_variables_initializer complete")