Example 1
        def get_loss(model, placeholder_dict):
            a = placeholder_dict["A"]
            adv = placeholder_dict["ADV"]
            r = placeholder_dict["R"]
            oldneglogpac = placeholder_dict["OLDNEGLOGPAC"]
            oldvpred = placeholder_dict["OLDVPRED"]
            clip_range = placeholder_dict["CLIPRANGE"]

            neglogpac = model.pd.neglogp(a)
            entropy = tf.reduce_mean(cat_entropy(model.pi_logit))
            vpred = model.vf
            vpredclipped = oldvpred + tf.clip_by_value(model.vf - oldvpred,
                                                       -clip_range, clip_range)
            vf_losses1 = tf.square(vpred - r)
            vf_losses2 = tf.square(vpredclipped - r)
            vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
            ratio = tf.exp(oldneglogpac - neglogpac)
            pg_losses = -adv * ratio
            pg_losses2 = -adv * tf.clip_by_value(ratio, 1.0 - clip_range,
                                                 1.0 + clip_range)
            pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
            approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - oldneglogpac))
            clipfrac = tf.reduce_mean(
                tf.to_float(tf.greater(tf.abs(ratio - 1.0), clip_range)))
            return pg_loss, entropy, vf_loss, vpred, neglogpac, approxkl, clipfrac
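For reference, a minimal NumPy sketch of the clipped surrogate objective computed above (the function name and the standalone form are illustrative, not part of the original code): the policy-gradient term is the mean of the element-wise maximum of the unclipped and clipped losses.

import numpy as np

def clipped_surrogate(old_neglogp, new_neglogp, adv, clip_range=0.2):
    # ratio = pi_new(a|s) / pi_old(a|s), recovered from the negative log-probabilities
    ratio = np.exp(old_neglogp - new_neglogp)
    pg_losses = -adv * ratio
    pg_losses_clipped = -adv * np.clip(ratio, 1.0 - clip_range, 1.0 + clip_range)
    # the element-wise maximum of the two losses gives the pessimistic (clipped) bound
    return np.mean(np.maximum(pg_losses, pg_losses_clipped))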
Example 2
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 reuse=False,
                 **conv_kwargs):  # pylint: disable=W0613
        self.pdtype = make_pdtype(ac_space)
        #X, processed_x = observation_input(ob_space, nbatch)
        X, processed_x = observation_input(ob_space, None)
        print('X:', X.shape)
        print('processed_X:', processed_x.shape)
        print('ac_space:', ac_space.n)
        with tf.variable_scope("model", reuse=reuse):
            h = cnn_grid(processed_x, **conv_kwargs)
            actor_l1 = fc(h, 'actor', nh=64, init_scale=np.sqrt(2))
            self.phi = actor_l2 = tf.nn.tanh(actor_l1)
            #actor_l3 = fc(actor_l2, 'actor2', nh=action_space.n, init_scale=np.sqrt(2))
            critic_l1 = fc(h, 'critic', nh=64, init_scale=np.sqrt(2))
            critic_l2 = tf.nn.tanh(critic_l1)
            critic_l3 = fc(critic_l2, 'critic2', nh=1, init_scale=np.sqrt(2))
            vf0 = critic_l3
            vf = vf0[:, 0]
            self.pd, self.pi = self.pdtype.pdfromlatent(actor_l2,
                                                        init_scale=0.01)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        self.entropy = cat_entropy(self.pi)

        with tf.variable_scope("planning", reuse=reuse):
            # predict next action
            a0_onehot = tf.stop_gradient(tf.one_hot(a0, ac_space.n, axis=-1))
            f = tf.concat([self.phi, a0_onehot], axis=1)
            self.pd_p, self.pi_p = self.pdtype.pdfromlatent(f, init_scale=0.01)
            self.ap = self.pd_p.sample()

        def step(ob, *_args, **_kwargs):
            a, v, neglogp, ap = sess.run([a0, vf, neglogp0, self.ap], {X: ob})
            return a, v, ap, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X: ob})

        def neg_log_prob(actions):
            return tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.pi, labels=actions)

        self.X = X
        self.vf = vf
        self.step = step
        self.value = value
        self.neg_log_prob = neg_log_prob
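The planning head above conditions on the sampled action by concatenating the actor features with a one-hot encoding of that action. A small NumPy restatement of that concatenation (names are illustrative):

import numpy as np

def concat_action_features(phi, actions, n_actions):
    # phi: [batch, d] actor features; actions: [batch] integer action ids
    one_hot = np.eye(n_actions)[actions]            # [batch, n_actions]
    return np.concatenate([phi, one_hot], axis=1)   # [batch, d + n_actions]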
Example 3
    def get_loss(model, placeholder_dict):
        a = placeholder_dict["A"]
        adv = placeholder_dict["ADV"]
        r = placeholder_dict["R"]
        # Compute the cross-entropy loss between the estimated action distribution and the 'true' distribution of taken actions
        chosen_action_log_probs = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=model.pi_logit, labels=a)
        pg_loss = tf.reduce_mean(adv * chosen_action_log_probs)  # minimize
        vf_loss = tf.reduce_mean(mse(tf.squeeze(model.vf), r))  # minimize
        entropy = -tf.reduce_mean(cat_entropy(model.pi_logit))  # maximize
        return pg_loss, entropy, vf_loss, model.vf, chosen_action_log_probs, None, None
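The loss above relies on the identity that sparse softmax cross-entropy with the taken action as label equals the negative log-probability of that action under the softmax policy. A NumPy check of that identity (illustrative, not from the original code):

import numpy as np

def neglogp_from_logits(logits, actions):
    # numerically stable log-softmax over the action dimension
    z = logits - logits.max(axis=1, keepdims=True)
    log_probs = z - np.log(np.exp(z).sum(axis=1, keepdims=True))
    # negative log-probability of each chosen action
    return -log_probs[np.arange(len(actions)), actions]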
Example 4
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 reuse=False,
                 **conv_kwargs):  # pylint: disable=W0613
        self.pdtype = make_pdtype(ac_space)
        #X, processed_x = observation_input(ob_space, nbatch)
        X, processed_x = observation_input(ob_space, None)
        print('X:', X.shape)
        print('processed_X:', processed_x.shape)
        with tf.variable_scope("model", reuse=reuse):
            h = nature_cnn(processed_x, **conv_kwargs)
            vf = fc(h, 'v', 1)[:, 0]
            self.pd, self.pi = self.pdtype.pdfromlatent(h, init_scale=0.01)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        self.entropy = cat_entropy(self.pi)

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X: ob})

        def neg_log_prob(actions):
            return tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.pi, labels=actions)

        self.X = X
        self.vf = vf
        self.step = step
        self.value = value
        self.neg_log_prob = neg_log_prob
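observation_input is imported from the surrounding codebase and is not defined in these examples. A hedged stand-in for image observations, stated here as an assumption about its behavior (a uint8 placeholder plus a float32 copy scaled to [0, 1]):

import tensorflow as tf

def observation_input(ob_space, batch_size=None, name='Ob'):
    # placeholder with a leading batch dimension, plus a float32 copy for the network input
    X = tf.placeholder(tf.uint8, (batch_size,) + ob_space.shape, name=name)
    processed_x = tf.cast(X, tf.float32) / 255.0
    return X, processed_x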
Example 5
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False):  # pylint: disable=W0613
        self.pdtype = make_pdtype(ac_space)
        X, processed_x = observation_input(ob_space, None)
        #X = tf.placeholder(shape=input_shape,dtype=tf.float32, name='ob')
        with tf.variable_scope("model", reuse=reuse):
            activ = tf.tanh
            processed_x = tf.layers.flatten(processed_x)
            pi_h1 = activ(
                fc(processed_x, 'pi_fc1', nh=64, init_scale=np.sqrt(2)))
            pi_h2 = activ(fc(pi_h1, 'pi_fc2', nh=64, init_scale=np.sqrt(2)))
            vf_h1 = activ(
                fc(processed_x, 'vf_fc1', nh=64, init_scale=np.sqrt(2)))
            vf_h2 = activ(fc(vf_h1, 'vf_fc2', nh=64, init_scale=np.sqrt(2)))
            vf = fc(vf_h2, 'vf', 1)[:, 0]

            self.pd, self.pi = self.pdtype.pdfromlatent(pi_h2, init_scale=0.01)

        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None
        self.entropy = cat_entropy(self.pi)

        def step(ob, *_args, **_kwargs):
            a, v, neglogp = sess.run([a0, vf, neglogp0], {X: ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X: ob})

        def neg_log_prob(actions):
            return tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=self.pi, labels=actions)

        self.X = X
        self.vf = vf
        self.step = step
        self.value = value
        self.neg_log_prob = neg_log_prob
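The fc helper used throughout these examples also comes from the surrounding project. A minimal TF1-style stand-in, under the assumption that it is a plain fully connected layer with a scaled initializer (the initializer choice here is illustrative):

import tensorflow as tf

def fc(x, scope, nh, init_scale=1.0, init_bias=0.0):
    # fully connected layer: [batch, nin] -> [batch, nh]
    with tf.variable_scope(scope):
        nin = x.get_shape()[1].value
        w = tf.get_variable("w", [nin, nh],
                            initializer=tf.orthogonal_initializer(gain=init_scale))
        b = tf.get_variable("b", [nh],
                            initializer=tf.constant_initializer(init_bias))
        return tf.matmul(x, w) + b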
Example 6
    def __init__(self,
                 state_dim,
                 n_actions,
                 n_steps,
                 vf_coeff=0.5,
                 entropy_coeff=0.001,
                 lr=0.0007,
                 lr_decay=0.99,
                 fuzz_factor=0.00005,
                 total_timesteps=800000,
                 max_grad_norm=0.5,
                 scope='actor_critic'):
        # fuzz_factor was called epsilon
        sess = tf.Session()  # TODO add CPU config information

        # Targets in loss computation
        advantage = tf.placeholder(
            dtype=tf.float32, shape=[None],
            name='advantage')  # advantage of the chosen action
        discounted_reward = tf.placeholder(
            dtype=tf.float32, shape=[None],
            name='reward')  # value function target
        action = tf.placeholder(dtype=tf.int32, shape=[None],
                                name='action_in')  # action index
        LR = tf.placeholder(dtype=tf.float32, shape=[])  # learning rate

        # target_model = SharedMLP(sess, state_dim, n_actions)  # used to predict action probs and state values
        # train_model = SharedMLP(sess, state_dim, n_actions)  #, reuse=True)
        # target_model = LSTM(sess, state_dim, n_actions, n_steps=n_steps)
        # used to predict action probs and state values
        train_model = LSTM_SM(sess, state_dim, n_actions, n_steps=n_steps)

        action_onehot = tf.one_hot(action, n_actions, dtype=tf.float32)
        chosen_action_prob = tf.reduce_sum(train_model.ap_out * action_onehot,
                                           1)

        # Compute losses: policy gradient loss (= advantage of the action), the value function loss
        # (Mean Squared error of 1-step TD-target) and an additional regulator regularizing the entropy of the policy,
        # to enhance exploration
        # vf_loss = mean(discounted_reward - estimated_value)²
        # pg_loss = mean(log(action_probs) * advantages)

        pg_loss = -tf.reduce_sum(tf.log(chosen_action_prob) * advantage)
        # action_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.ap_out, labels=action)
        # pg_loss = tf.reduce_mean(action_log_prob * advantage)

        vf_loss = tf.reduce_mean(
            tf.squared_difference(train_model.vf_out, discounted_reward) / 2.)
        entropy = tf.reduce_mean(cat_entropy(train_model.ap_out))
        loss = pg_loss + vf_coeff * vf_loss - entropy_coeff * entropy

        # Compute gradient of the expected reward w.r.t. the policy parameters
        # tf.trainable_variables() is not filtered by an enclosing variable_scope,
        # so this collects every trainable variable in the graph
        params = tf.trainable_variables()
        grads = tf.gradients(loss, params)
        # clip gradients eventually
        if max_grad_norm is not None:
            # correct way of clipping but slower than clip_by_norm
            grads, _ = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        optimizer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                              decay=lr_decay,
                                              epsilon=fuzz_factor)
        train_step = optimizer.apply_gradients(grads)

        _lr = LrDecay(v_init=lr, decay=lr_decay, n_step=total_timesteps)

        # training of training model
        def trainActorCritic(obs, actions, discounted_rewards, values, dones,
                             states):
            adv = discounted_rewards - values
            # enter advantage as one-hot vector
            # adv = [tf.one_hot(a, 1)*adv for a in actions]
            for i in range(len(obs)):
                lr_cur = _lr.value()
            if states is not None:  # LSTM network
                train_dict = {
                    train_model.obs_in: obs,
                    train_model.D: dones,
                    train_model.LS: states,
                    advantage: adv,
                    action: actions,
                    discounted_reward: discounted_rewards,
                    LR: lr_cur
                }
            else:  # MLP network
                train_dict = {
                    train_model.obs_in: obs,
                    advantage: adv,
                    action: actions,
                    discounted_reward: discounted_rewards,
                    LR: lr_cur
                }
            # policy_loss, value_loss, policy_entropy, _, ap, a = sess.run([pg_loss,
            #                                                               vf_loss,
            #                                                               entropy,
            #                                                               train_step,
            #                                                               train_model.ap_out,
            #                                                               train_model.a0],
            #                                                              train_dict)
            policy_loss, value_loss, policy_entropy, _, aprob = sess.run(
                [pg_loss, vf_loss, entropy, train_step, train_model.ap_out],
                train_dict)
            return policy_loss, value_loss, policy_entropy, aprob

        # def save_params():
        #
        # def load_params():

        self.train = trainActorCritic
        self.train_model = train_model
        # self.target_model = target_model
        # self.step = target_model.step
        # self.value = target_model.value
        # self.initial_states = target_model.initial_states
        self.target_model = train_model
        self.step = train_model.step
        self.value = train_model.value
        self.initial_states = train_model.initial_states
        # self.save = save_params
        # self.load = load_params
        tf.global_variables_initializer().run(session=sess)
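cat_entropy, used in every example above, is likewise imported from the project's utilities. Assuming it computes the entropy of the categorical distribution defined by unnormalized logits, a standard TF1 formulation is:

import tensorflow as tf

def cat_entropy(logits):
    # entropy of the softmax distribution, computed in a numerically stable way
    a0 = logits - tf.reduce_max(logits, 1, keepdims=True)
    ea0 = tf.exp(a0)
    z0 = tf.reduce_sum(ea0, 1, keepdims=True)
    p0 = ea0 / z0
    return tf.reduce_sum(p0 * (tf.log(z0) - a0), 1)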
Example 7
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 nsteps,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 mf_coef=0.5,
                 max_grad_norm=0.5,
                 lr=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 total_timesteps=int(80e6),
                 lrschedule='linear'):

        sess = tf_util.make_session()
        nact = ac_space.n
        nbatch = nenvs * nsteps

        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        ADV_MOMENT = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        R2 = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])
        ENT_COEF = tf.placeholder(tf.float32, [])

        step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
        train_model = policy(sess,
                             ob_space,
                             ac_space,
                             nenvs * nsteps,
                             nsteps,
                             reuse=True)

        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.pi, labels=A)
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        mf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.mf), R2))
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        ent_coef = Scheduler(v=ent_coef,
                             nvalues=total_timesteps / 10,
                             schedule='step')
        mf_coef = 0.01
        loss = pg_loss - entropy * ENT_COEF + vf_loss * vf_coef + mf_loss * mf_coef
        # loss = pg_loss + vf_loss * vf_coef + mf_loss * mf_coef
        # loss = pg_loss - entropy*ent_coef + vf_loss * vf_coef

        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=alpha,
                                            epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, rewards_square, masks, actions, values,
                  moments):
            values_random = np.random.normal(
                loc=values, scale=np.sqrt(np.maximum(moments - values**2, 0)))
            # values_random = values - np.sqrt(np.maximum(moments - values ** 2,0))
            advs = rewards - values_random
            # advs = (1 - 2 * rewards) * rewards - values  + 2 * values * values
            advs_moment = rewards_square - moments
            # advs = (1 + 2 * rewards) * (rewards)
            # advs_moment = rewards_square
            for step in range(len(obs)):
                cur_lr = lr.value()
                cur_ent_coef = ent_coef.value()
            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                ADV_MOMENT: advs_moment,
                R: rewards,
                R2: rewards_square,
                LR: cur_lr,
                ENT_COEF: cur_ent_coef
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            policy_loss, value_loss, moment_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, mf_loss, entropy, _train], td_map)
            return policy_loss, value_loss, moment_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            make_path(osp.dirname(save_path))
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
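The advantage in train() subtracts a value sampled from a Gaussian whose variance is the estimated second moment minus the squared value estimate (Var[R] = E[R^2] - E[R]^2), clipped at zero. A NumPy restatement of that sampling step (illustrative):

import numpy as np

def sample_values(values, moments, rng=np.random):
    # variance estimate Var[R] = E[R^2] - E[R]^2, clipped at zero against numerical noise
    variance = np.maximum(moments - values ** 2, 0.0)
    return rng.normal(loc=values, scale=np.sqrt(variance))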
Example 8
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 nsteps,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 lr=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 total_timesteps=int(80e6),
                 lrschedule='linear',
                 summary_dir=None):

        sess = tf_util.make_session()
        nbatch = nenvs * nsteps

        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
        train_model = policy(sess,
                             ob_space,
                             ac_space,
                             nenvs * nsteps,
                             nsteps,
                             reuse=True)

        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.pi, labels=A)
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=alpha,
                                            epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        # storing summaries
        episode_reward = tf.placeholder("float")
        tf.summary.scalar("policy_loss", pg_loss)
        tf.summary.scalar("entropy", entropy)
        tf.summary.scalar("value_loss", vf_loss)
        tf.summary.scalar("episode_reward", episode_reward)
        summary_op = tf.summary.merge_all()

        def train(obs, states, mean_reward, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()
            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                LR: cur_lr,
                episode_reward: mean_reward
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            policy_loss, value_loss, policy_entropy, summary, _ = sess.run(
                [pg_loss, vf_loss, entropy, summary_op, _train], td_map)
            return policy_loss, value_loss, policy_entropy, summary

        def save(save_path):
            ps = sess.run(params)
            make_path(osp.dirname(save_path))
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
        self.train_writer = tf.summary.FileWriter(summary_dir, sess.graph)
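A self-contained TF1 sketch of the summary pattern used above, with illustrative names and an illustrative log directory: scalar summaries are merged once, evaluated alongside the training ops, and the resulting proto is written by the FileWriter together with a step counter.

import tensorflow as tf

reward_ph = tf.placeholder(tf.float32, [], name="episode_reward")
tf.summary.scalar("episode_reward", reward_ph)
summary_op = tf.summary.merge_all()

with tf.Session() as sess:
    writer = tf.summary.FileWriter("/tmp/summary_demo", sess.graph)  # directory is illustrative
    for update in range(3):
        summary = sess.run(summary_op, {reward_ph: float(update)})
        writer.add_summary(summary, global_step=update)
    writer.close()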
Example 9
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 nsteps,
                 nstack,
                 num_procs,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 lr=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 total_timesteps=int(80e6),
                 lrschedule='linear',
                 optimizer='adam'):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=num_procs,
                                inter_op_parallelism_threads=num_procs)
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        nbatch = nenvs * nsteps

        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        train_model = policy(sess,
                             ob_space,
                             ac_space,
                             nenvs,
                             nsteps,
                             nstack,
                             reuse=True)
        step_model = train_model

        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.pi, labels=A)
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        loss = pg_loss + vf_loss * vf_coef - entropy * ent_coef

        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        if optimizer == 'adam':
            trainer = tf.train.AdamOptimizer()
        else:
            trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                                decay=alpha,
                                                epsilon=epsilon)

        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()
            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                LR: cur_lr
            }
            if states != []:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            total_loss, policy_loss, value_loss, policy_entropy, _ = sess.run(
                [loss, pg_loss, vf_loss, entropy, _train], td_map)
            return total_loss, policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            make_path(osp.dirname(save_path))
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
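Scheduler, used for the learning-rate decay in the last three examples, is another project utility. A minimal stand-in covering only the 'linear' schedule used here (an assumption about its interface, not the project's implementation):

class Scheduler:
    # decays from v to 0 linearly over nvalues calls to value()
    def __init__(self, v, nvalues, schedule='linear'):
        assert schedule == 'linear'
        self.v = v
        self.nvalues = float(nvalues)
        self.n = 0.0

    def value(self):
        current = self.v * max(1.0 - self.n / self.nvalues, 0.0)
        self.n += 1.0
        return current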
Example 10
    def create_train_graph(self):

        opt = tf.train.AdamOptimizer(learning_rate=POLICY_LR)

        if IS_TRAINING:
            in_shape = [None, N_FRAMES * C_IN, HEIGHT, WIDTH]
        else:
            in_shape = [1, N_FRAMES * C_IN, HEIGHT, WIDTH]

        self.x = tf.placeholder(tf.float32, in_shape, "x_fake")

        self.adv_vec = tf.placeholder(tf.float32, [None, NUM_ACTIONS],
                                      "advantage_vector")

        self.y = tf.placeholder(tf.int32, [None])

        # self.a_idx = tf.placeholder(tf.int32, [None, ], "action_index")

        self.model = DIN(num_actions=NUM_ACTIONS, is_training=True)

        # Discriminator training graph: expert label = [1, 0], fake label = [0, 1]

        self.expert_sequence = self.read_data(N_EPISODES)

        self.x_expert = tf.reshape(self.expert_sequence,
                                   [N_STEPS, N_FRAMES * C_IN, HEIGHT, WIDTH])

        self.a_logits, self.d_logits = self.model.forward(self.x, reuse=False)

        _, self.d_logits_ex = self.model.forward(self.x_expert, reuse=True)

        d_expert_loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=tf.zeros(N_STEPS, tf.int32), logits=self.d_logits_ex))

        d_fake_loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=self.y, logits=self.d_logits))

        neglogpac = tf.nn.softmax_cross_entropy_with_logits(
            logits=self.a_logits, labels=self.adv_vec)

        self.pg_loss = BETA_PG * tf.reduce_mean(neglogpac)

        self.entropy = BETA_ENT * tf.reduce_mean(
            utils.cat_entropy(self.a_logits))

        self.d_loss = BETA_DISC * (d_fake_loss + d_expert_loss)

        train_vars = tf.trainable_variables()

        self.grad_norm_d = utils.gradient_norm(self.d_loss, train_vars)

        self.grad_norm_p = utils.gradient_norm(self.pg_loss, train_vars)

        self.grad_norm_ent = utils.gradient_norm(self.entropy, train_vars)

        loss = self.pg_loss - self.entropy + self.d_loss

        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

        with tf.control_dependencies(update_ops):

            gvs = opt.compute_gradients(loss)

            capped_gvs = [(tf.clip_by_value(grad, -MAX_GRAD, MAX_GRAD), var)
                          for grad, var in gvs]

            self.g_norm, self.w_norm = utils.compute_mean_abs_norm(capped_gvs)

            self.grad_op = opt.apply_gradients(capped_gvs)
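Unlike the earlier examples, which clip the global gradient norm, this graph clips each gradient element-wise. A short TF1 sketch contrasting the two options (function names are illustrative):

import tensorflow as tf

def clip_elementwise(grads_and_vars, max_grad):
    # per-element clipping, as in this example (skip variables that received no gradient)
    return [(tf.clip_by_value(g, -max_grad, max_grad), v)
            for g, v in grads_and_vars if g is not None]

def clip_globally(grads, max_norm):
    # joint rescaling of the whole gradient list, as in the A2C examples above
    clipped, _ = tf.clip_by_global_norm(grads, max_norm)
    return clipped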