class Discriminator(object):
    def __init__(self,
                 sess,
                 ob_spaces,
                 ac_spaces,
                 nstack,
                 index,
                 disc_type='decentralized',
                 hidden_size=128,
                 lr_rate=0.01,
                 total_steps=50000,
                 scope="discriminator",
                 kfac_clip=0.001,
                 max_grad_norm=0.5):
        self.lr = Scheduler(v=lr_rate, nvalues=total_steps, schedule='linear')
        self.disc_type = disc_type
        if disc_type not in disc_types:
            assert False
        self.scope = scope
        self.index = index
        self.sess = sess
        ob_space = ob_spaces[index]
        ac_space = ac_spaces[index]
        self.ob_shape = ob_space.shape[0] * nstack
        nact = ac_space.n
        self.ac_shape = nact * nstack
        self.all_ob_shape = sum([obs.shape[0] for obs in ob_spaces]) * nstack
        self.all_ac_shape = sum([ac.n for ac in ac_spaces]) * nstack
        self.hidden_size = hidden_size

        if disc_type == 'decentralized':
            input_shape = self.ob_shape + self.ac_shape
        elif disc_type == 'centralized':
            input_shape = self.all_ob_shape + self.all_ac_shape
        elif disc_type == 'single':
            input_shape = self.all_ob_shape + self.all_ac_shape
        else:
            assert False

        self.g = tf.placeholder(tf.float32, (None, input_shape))
        self.e = tf.placeholder(tf.float32, (None, input_shape))
        self.lr_rate = tf.placeholder(tf.float32, ())
        self.adv = tf.placeholder(tf.float32, ())

        num_outputs = len(ob_spaces) if disc_type == 'centralized' else 1

        logits = self.build_graph(tf.concat([self.g, self.e], axis=0),
                                  num_outputs,
                                  reuse=False)
        labels = tf.concat([
            tf.ones([tf.shape(self.g)[0], 1]),
            -tf.ones([tf.shape(self.e)[0], 1])
        ],
                           axis=0)

        g_logits = self.build_graph(self.g, num_outputs, reuse=True)
        e_logits = self.build_graph(self.e, num_outputs, reuse=True)

        self.g_loss = tf.reduce_mean(g_logits)
        self.e_loss = tf.reduce_mean(-e_logits)

        # self.g_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
        #     logits=g_logits, labels=tf.zeros_like(g_logits)))
        # self.e_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
        #     logits=e_logits, labels=tf.ones_like(e_logits)))

        self.total_loss = logits * labels  # tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels))

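        # Gradient penalty on a random interpolation of the generator and expert
        # batches (WGAN-GP style, with a single interpolation coefficient shared
        # across the batch); self.ddd pushes the critic's gradient norm towards 1.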
        epsilon = tf.random_uniform([], 0.0, 1.0)
        ge = self.g * epsilon + self.e * (1 - epsilon)
        gel = self.build_graph(ge, num_outputs, reuse=True)
        ddd = tf.gradients(gel, [ge])[0]
        ddd = tf.norm(ddd, axis=1)
        self.ddd = tf.reduce_mean(tf.square(ddd - 1.)) * 5

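        # Fisher-matching loss for the K-FAC optimizer below: treat the logits as the
        # mean of a unit-variance Gaussian and fit a noisy sample of them (the same
        # construction ACKTR uses for its value-function Fisher term).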
        sample_net = logits + tf.random_normal(tf.shape(logits))
        fisher_loss = -tf.reduce_mean(
            tf.pow(logits - tf.stop_gradient(sample_net), 2))

        self.reward_op = tf.sigmoid(g_logits)
        # self.reward_op = tf.nn.sigmoid_cross_entropy_with_logits(logits=g_logits, labels=tf.zeros_like(g_logits))

        self.var_list = self.get_trainable_variables()
        params = find_trainable_variables(self.scope)
        grads = tf.gradients(self.total_loss, params)

        # self.d_optim = tf.train.AdamOptimizer(self.lr_rate, beta1=0.5, beta2=0.9).minimize(self.total_loss, var_list=self.var_list)
        with tf.variable_scope(self.scope + '/d_optim'):
            d_optim = kfac.KfacOptimizer(learning_rate=self.lr_rate,
                                         clip_kl=kfac_clip,
                                         momentum=0.9,
                                         kfac_update=1,
                                         epsilon=0.01,
                                         stats_decay=0.99,
                                         async_var=0,
                                         cold_iter=10,
                                         max_grad_norm=max_grad_norm)
            update_stats_op = d_optim.compute_and_apply_stats(fisher_loss,
                                                              var_list=params)
            train_op, q_runner = d_optim.apply_gradients(
                list(zip(grads, params)))
            self.q_runner = q_runner

        self.g_optim = tf.train.AdamOptimizer(learning_rate=0.0005).minimize(
            self.ddd)
        self.d_optim = train_op
        self.saver = tf.train.Saver(self.get_variables())

        self.params_flat = self.get_trainable_variables()
        # self.clip = [tf.assign(v, tf.clip_by_value(v, -0.05, 0.05)) for v in self.get_trainable_variables()]
        # self.clip = tf.group(*self.clip)

    def build_graph(self, x, num_outputs=1, reuse=False):
        with tf.variable_scope(self.scope):
            if reuse:
                tf.get_variable_scope().reuse_variables()
            p_h1 = fc(x, 'fc1', nh=self.hidden_size)
            p_h2 = fc(p_h1, 'fc2', nh=self.hidden_size)
            logits = fc(p_h2, 'out', nh=num_outputs, act=lambda x: x)
        return logits

    def get_variables(self):
        return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)

    def get_trainable_variables(self):
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)

    def get_reward(self, obs, acs):
        if len(obs.shape) == 1:
            obs = np.expand_dims(obs, 0)
        if len(acs.shape) == 1:
            acs = np.expand_dims(acs, 0)
        feed_dict = {self.g: np.concatenate([obs, acs], axis=1)}
        return self.sess.run(self.reward_op, feed_dict)

    def train(self, g_obs, g_acs, e_obs, e_acs):
        feed_dict = {
            self.g: np.concatenate([g_obs, g_acs], axis=1),
            self.e: np.concatenate([e_obs, e_acs], axis=1),
            self.lr_rate: self.lr.value()
        }
        loss, _ = self.sess.run([self.total_loss, self.d_optim], feed_dict)
        for _ in range(5):
            self.sess.run(self.g_optim, feed_dict)
        g_loss, e_loss = self.sess.run([self.g_loss, self.e_loss], feed_dict)
        return g_loss, e_loss, None, None

    def restore(self, path):
        print('restoring from: ' + path)
        self.saver.restore(self.sess, path)

    def save(self, save_path):
        ps = self.sess.run(self.params_flat)
        joblib.dump(ps, save_path)

    def load(self, load_path):
        loaded_params = joblib.load(load_path)
        restores = []
        for p, loaded_p in zip(self.params_flat, loaded_params):
            restores.append(p.assign(loaded_p))
        self.sess.run(restores)
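
# A minimal usage sketch for the Discriminator defined above (not part of the
# original source). It assumes the surrounding module provides `disc_types`,
# `Scheduler`, `fc`, `find_trainable_variables` and the `kfac` optimizer; the
# spaces and batch data below are hypothetical placeholders.
import numpy as np
import tensorflow as tf
from gym import spaces

ob_spaces = [spaces.Box(low=-1.0, high=1.0, shape=(8,)) for _ in range(2)]
ac_spaces = [spaces.Discrete(5) for _ in range(2)]

sess = tf.Session()
disc = Discriminator(sess, ob_spaces, ac_spaces, nstack=1, index=0,
                     disc_type='decentralized')
sess.run(tf.global_variables_initializer())

# One update on a batch of (observation, one-hot action) pairs; here the "expert"
# batch is just a copy of the generator batch for illustration.
g_obs = np.random.randn(32, 8).astype(np.float32)
g_acs = np.eye(5, dtype=np.float32)[np.random.randint(5, size=32)]
g_loss, e_loss, _, _ = disc.train(g_obs, g_acs, g_obs, g_acs)
rewards = disc.get_reward(g_obs, g_acs)  # sigmoid of the discriminator logits

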
class Discriminator(object):
    def __init__(self,
                 sess,
                 ob_spaces,
                 ac_spaces,
                 state_only,
                 discount,
                 nstack,
                 index,
                 disc_type='decentralized',
                 hidden_size=128,
                 lr_rate=0.01,
                 total_steps=50000,
                 scope="discriminator",
                 kfac_clip=0.001,
                 max_grad_norm=0.5,
                 l2_loss_ratio=0.01):
        self.lr = Scheduler(v=lr_rate, nvalues=total_steps, schedule='linear')
        self.disc_type = disc_type
        self.l2_loss_ratio = l2_loss_ratio
        if disc_type not in disc_types:
            assert False
        self.state_only = state_only
        self.gamma = discount
        self.scope = scope
        self.index = index
        self.sess = sess
        ob_space = ob_spaces[index]
        ac_space = ac_spaces[index]
        self.ob_shape = ob_space.shape[0] * nstack
        self.all_ob_shape = sum([obs.shape[0] for obs in ob_spaces]) * nstack
        # Discrete action spaces expose .n; continuous (Box) spaces fall back to shape[0].
        try:
            nact = ac_space.n
        except AttributeError:
            nact = ac_space.shape[0]
        self.ac_shape = nact * nstack
        try:
            self.all_ac_shape = sum([ac.n for ac in ac_spaces]) * nstack
        except AttributeError:
            self.all_ac_shape = sum([ac.shape[0] for ac in ac_spaces]) * nstack
        self.hidden_size = hidden_size

        if disc_type == 'decentralized':
            self.obs = tf.placeholder(tf.float32, (None, self.ob_shape))
            self.nobs = tf.placeholder(tf.float32, (None, self.ob_shape))
            self.act = tf.placeholder(tf.float32, (None, self.ac_shape))
            self.labels = tf.placeholder(tf.float32, (None, 1))
            self.lprobs = tf.placeholder(tf.float32, (None, 1))
        elif disc_type == 'decentralized-all':
            self.obs = tf.placeholder(tf.float32, (None, self.all_ob_shape))
            self.nobs = tf.placeholder(tf.float32, (None, self.all_ob_shape))
            self.act = tf.placeholder(tf.float32, (None, self.all_ac_shape))
            self.labels = tf.placeholder(tf.float32, (None, 1))
            self.lprobs = tf.placeholder(tf.float32, (None, 1))
        else:
            assert False

        self.lr_rate = tf.placeholder(tf.float32, ())

        with tf.variable_scope(self.scope):
            rew_input = self.obs
            if not self.state_only:
                rew_input = tf.concat([self.obs, self.act], axis=1)

            with tf.variable_scope('reward'):
                self.reward = self.relu_net(rew_input, dout=1)
                # self.reward = self.tanh_net(rew_input, dout=1)

            with tf.variable_scope('vfn'):
                self.value_fn_n = self.relu_net(self.nobs, dout=1)
                # self.value_fn_n = self.tanh_net(self.nobs, dout=1)
            with tf.variable_scope('vfn', reuse=True):
                self.value_fn = self.relu_net(self.obs, dout=1)
                # self.value_fn = self.tanh_net(self.obs, dout=1)

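            # AIRL discriminator: f(s, a, s') = r(s, a) + gamma * V(s') - V(s) plays the
            # role of the log-numerator, log_q_tau is the policy's log-probability of
            # the action, and D = exp(f) / (exp(f) + pi(a|s)) is computed via logsumexp.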
            log_q_tau = self.lprobs
            log_p_tau = self.reward + self.gamma * self.value_fn_n - self.value_fn
            log_pq = tf.reduce_logsumexp([log_p_tau, log_q_tau], axis=0)
            self.discrim_output = tf.exp(log_p_tau - log_pq)

        self.total_loss = -tf.reduce_mean(self.labels * (log_p_tau - log_pq) +
                                          (1 - self.labels) *
                                          (log_q_tau - log_pq))
        self.var_list = self.get_trainable_variables()
        params = find_trainable_variables(self.scope)
        self.l2_loss = tf.add_n([tf.nn.l2_loss(v)
                                 for v in params]) * self.l2_loss_ratio
        self.total_loss += self.l2_loss

        grads = tf.gradients(self.total_loss, params)
        # fisher_loss = -self.total_loss
        # self.d_optim = tf.train.AdamOptimizer(self.lr_rate, beta1=0.5, beta2=0.9).minimize(self.total_loss, var_list=self.var_list)
        with tf.variable_scope(self.scope + '/d_optim'):
            # d_optim = kfac.KfacOptimizer(
            #     learning_rate=self.lr_rate, clip_kl=kfac_clip,
            #     momentum=0.9, kfac_update=1, epsilon=0.01,
            #     stats_decay=0.99, async=0, cold_iter=10,
            #     max_grad_norm=max_grad_norm)
            # update_stats_op = d_optim.compute_and_apply_stats(fisher_loss, var_list=params)
            # train_op, q_runner = d_optim.apply_gradients(list(zip(grads, params)))
            # self.q_runner = q_runner
            d_optim = tf.train.AdamOptimizer(learning_rate=self.lr_rate)
            train_op = d_optim.apply_gradients(list(zip(grads, params)))
        self.d_optim = train_op
        self.saver = tf.train.Saver(self.get_variables())

        self.params_flat = self.get_trainable_variables()

    def relu_net(self, x, layers=2, dout=1, hidden_size=128):
        out = x
        for i in range(layers):
            out = relu_layer(out, dout=hidden_size, name='l%d' % i)
        out = linear(out, dout=dout, name='lfinal')
        return out

    def tanh_net(self, x, layers=2, dout=1, hidden_size=128):
        out = x
        for i in range(layers):
            out = tanh_layer(out, dout=hidden_size, name='l%d' % i)
        out = linear(out, dout=dout, name='lfinal')
        return out

    def get_variables(self):
        return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)

    def get_trainable_variables(self):
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)

    def get_reward(self, obs, acs, obs_next, path_probs, discrim_score=False):
        if len(obs.shape) == 1:
            obs = np.expand_dims(obs, 0)
        if len(acs.shape) == 1:
            acs = np.expand_dims(acs, 0)
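        # With discrim_score=True this returns the AIRL log-odds log(D) - log(1 - D);
        # otherwise it returns the learned reward network r(s, a) directly.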
        if discrim_score:
            feed_dict = {
                self.obs: obs,
                self.act: acs,
                self.nobs: obs_next,
                self.lprobs: path_probs
            }
            scores = self.sess.run(self.discrim_output, feed_dict)
            score = np.log(scores + 1e-20) - np.log(1 - scores + 1e-20)
        else:
            feed_dict = {self.obs: obs, self.act: acs}
            score = self.sess.run(self.reward, feed_dict)
        return score

    def train(self, g_obs, g_acs, g_nobs, g_probs, e_obs, e_acs, e_nobs,
              e_probs):
        labels = np.concatenate(
            (np.zeros([g_obs.shape[0], 1]), np.ones([e_obs.shape[0], 1])),
            axis=0)
        feed_dict = {
            self.obs: np.concatenate([g_obs, e_obs], axis=0),
            self.act: np.concatenate([g_acs, e_acs], axis=0),
            self.nobs: np.concatenate([g_nobs, e_nobs], axis=0),
            self.lprobs: np.concatenate([g_probs, e_probs], axis=0),
            self.labels: labels,
            self.lr_rate: self.lr.value()
        }
        loss, _ = self.sess.run([self.total_loss, self.d_optim], feed_dict)
        return loss

    def restore(self, path):
        print('restoring from: ' + path)
        self.saver.restore(self.sess, path)

    def save(self, save_path):
        ps = self.sess.run(self.params_flat)
        joblib.dump(ps, save_path)

    def load(self, load_path):
        loaded_params = joblib.load(load_path)
        restores = []
        for p, loaded_p in zip(self.params_flat, loaded_params):
            restores.append(p.assign(loaded_p))
        self.sess.run(restores)
    def __init__(self,
                 sess,
                 ob_spaces,
                 ac_spaces,
                 nstack,
                 index,
                 disc_type='decentralized',
                 hidden_size=128,
                 lr_rate=0.01,
                 total_steps=50000,
                 scope="discriminator",
                 kfac_clip=0.001,
                 max_grad_norm=0.5):
        self.lr = Scheduler(v=lr_rate, nvalues=total_steps, schedule='linear')
        self.disc_type = disc_type
        if disc_type not in disc_types:
            assert False
        self.scope = scope
        self.index = index
        self.sess = sess
        ob_space = ob_spaces[index]
        ac_space = ac_spaces[index]
        self.ob_shape = ob_space.shape[0] * nstack
        # Discrete action spaces expose .n; continuous (Box) spaces fall back to shape[0].
        try:
            nact = ac_space.n
        except AttributeError:
            nact = ac_space.shape[0]
        self.ac_shape = nact * nstack
        self.all_ob_shape = sum([obs.shape[0] for obs in ob_spaces]) * nstack
        try:
            self.all_ac_shape = sum([ac.n for ac in ac_spaces]) * nstack
        except AttributeError:
            self.all_ac_shape = sum([ac.shape[0] for ac in ac_spaces]) * nstack
        self.hidden_size = hidden_size

        if disc_type == 'decentralized':
            input_shape = self.ob_shape + self.all_ac_shape
        elif disc_type == 'decentralized-all':
            input_shape = self.all_ob_shape + self.all_ac_shape
        else:
            assert False

        self.g = tf.placeholder(tf.float32, (None, input_shape))
        self.e = tf.placeholder(tf.float32, (None, input_shape))
        self.lr_rate = tf.placeholder(tf.float32, ())
        self.adv = tf.placeholder(tf.float32, ())

        num_outputs = 1

        logits = self.build_graph(tf.concat([self.g, self.e], axis=0),
                                  num_outputs,
                                  reuse=False)
        labels = tf.concat([
            tf.zeros([tf.shape(self.g)[0], 1]),
            tf.ones([tf.shape(self.e)[0], 1])
        ],
                           axis=0)

        g_logits = self.build_graph(self.g, num_outputs, reuse=True)
        e_logits = self.build_graph(self.e, num_outputs, reuse=True)

        self.g_loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                logits=g_logits, labels=tf.zeros_like(g_logits)))
        self.e_loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                logits=e_logits, labels=tf.ones_like(e_logits)))

        self.total_loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(logits=logits,
                                                    labels=labels))
        fisher_loss = -self.total_loss

        # self.reward_op = tf.sigmoid(g_logits) * 2.0 - 1
        # self.reward_op = tf.log(tf.sigmoid(g_logits) + 1e-10)
        # self.reward_op = tf.nn.sigmoid_cross_entropy_with_logits(logits=g_logits, labels=tf.zeros_like(g_logits))
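        # The reward is the discriminator's log-odds log(D) - log(1 - D), i.e. the raw
        # logit up to the 1e-10 smoothing; it is positive where the discriminator
        # believes the sample came from the expert.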
        self.reward_op = tf.log(tf.sigmoid(g_logits) +
                                1e-10) - tf.log(1 - tf.sigmoid(g_logits) +
                                                1e-10)

        self.var_list = self.get_trainable_variables()
        params = find_trainable_variables(self.scope)
        grads = tf.gradients(self.total_loss, params)

        # self.d_optim = tf.train.AdamOptimizer(self.lr_rate, beta1=0.5, beta2=0.9).minimize(self.total_loss, var_list=self.var_list)
        with tf.variable_scope(self.scope + '/d_optim'):
            # d_optim = kfac.KfacOptimizer(
            #     learning_rate=self.lr_rate, clip_kl=kfac_clip,
            #     momentum=0.9, kfac_update=1, epsilon=0.01,
            #     stats_decay=0.99, async=0, cold_iter=10,
            #     max_grad_norm=max_grad_norm)
            # update_stats_op = d_optim.compute_and_apply_stats(fisher_loss, var_list=params)
            # train_op, q_runner = d_optim.apply_gradients(list(zip(grads, params)))
            # self.q_runner = q_runner
            d_optim = tf.train.AdamOptimizer(learning_rate=self.lr_rate)
            train_op = d_optim.apply_gradients(list(zip(grads, params)))

        self.d_optim = train_op
        self.saver = tf.train.Saver(self.get_variables())

        self.params_flat = self.get_trainable_variables()
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 total_timesteps,
                 nprocs=2,
                 nsteps=200,
                 nstack=1,
                 ent_coef=0.00,
                 vf_coef=0.5,
                 vf_fisher_coef=1.0,
                 lr=0.25,
                 max_grad_norm=0.5,
                 kfac_clip=0.001,
                 lrschedule='linear',
                 identical=None):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=nprocs,
                                inter_op_parallelism_threads=nprocs)
        config.gpu_options.allow_growth = True
        self.sess = sess = tf.Session(config=config)
        nbatch = nenvs * nsteps
        self.num_agents = num_agents = len(ob_space)
        self.n_actions = [ac_space[k].n for k in range(self.num_agents)]
        if identical is None:
            identical = [False for _ in range(self.num_agents)]

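        # Group agents that share parameters: for each "head" agent h, scale[h] counts
        # how many consecutive identical agents are folded into it, and pointer[h] is
        # one past the index of the last agent in that group.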
        scale = [1 for _ in range(num_agents)]
        pointer = [i for i in range(num_agents)]
        h = 0
        for k in range(num_agents):
            if identical[k]:
                scale[h] += 1
            else:
                pointer[h] = k
                h = k
        pointer[h] = num_agents

        A, ADV, R, PG_LR = [], [], [], []
        for k in range(num_agents):
            if identical[k]:
                A.append(A[-1])
                ADV.append(ADV[-1])
                R.append(R[-1])
                PG_LR.append(PG_LR[-1])
            else:
                A.append(tf.placeholder(tf.int32, [nbatch * scale[k]]))
                ADV.append(tf.placeholder(tf.float32, [nbatch * scale[k]]))
                R.append(tf.placeholder(tf.float32, [nbatch * scale[k]]))
                PG_LR.append(tf.placeholder(tf.float32, []))

        pg_loss, entropy, vf_loss, train_loss = [], [], [], []
        self.model = step_model = []
        self.model2 = train_model = []
        self.pg_fisher = pg_fisher_loss = []
        self.logits = logits = []
        sample_net = []
        self.vf_fisher = vf_fisher_loss = []
        self.joint_fisher = joint_fisher_loss = []
        self.lld = lld = []
        self.log_pac = []

        for k in range(num_agents):
            if identical[k]:
                step_model.append(step_model[-1])
                train_model.append(train_model[-1])
            else:
                step_model.append(
                    policy(sess,
                           ob_space[k],
                           ac_space[k],
                           ob_space,
                           ac_space,
                           nenvs,
                           1,
                           nstack,
                           reuse=False,
                           name='%d' % k))
                train_model.append(
                    policy(sess,
                           ob_space[k],
                           ac_space[k],
                           ob_space,
                           ac_space,
                           nenvs * scale[k],
                           nsteps,
                           nstack,
                           reuse=True,
                           name='%d' % k))
            logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=train_model[k].pi, labels=A[k])
            self.log_pac.append(-logpac)

            lld.append(tf.reduce_mean(logpac))
            logits.append(train_model[k].pi)

            pg_loss.append(tf.reduce_mean(ADV[k] * logpac))
            entropy.append(tf.reduce_mean(cat_entropy(train_model[k].pi)))
            pg_loss[k] = pg_loss[k] - ent_coef * entropy[k]
            vf_loss.append(
                tf.reduce_mean(mse(tf.squeeze(train_model[k].vf), R[k])))
            train_loss.append(pg_loss[k] + vf_coef * vf_loss[k])

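            # Fisher-matching losses for K-FAC: the policy term is the sampled
            # log-likelihood, and the value term fits a unit-variance Gaussian sample
            # around the value prediction (the standard ACKTR construction).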
            pg_fisher_loss.append(-tf.reduce_mean(logpac))
            sample_net.append(train_model[k].vf +
                              tf.random_normal(tf.shape(train_model[k].vf)))
            vf_fisher_loss.append(-vf_fisher_coef * tf.reduce_mean(
                tf.pow(train_model[k].vf - tf.stop_gradient(sample_net[k]),
                       2)))
            joint_fisher_loss.append(pg_fisher_loss[k] + vf_fisher_loss[k])

        self.policy_params = []
        self.value_params = []

        for k in range(num_agents):
            if identical[k]:
                self.policy_params.append(self.policy_params[-1])
                self.value_params.append(self.value_params[-1])
            else:
                self.policy_params.append(
                    find_trainable_variables("policy_%d" % k))
                self.value_params.append(
                    find_trainable_variables("value_%d" % k))
        self.params = params = [
            a + b for a, b in zip(self.policy_params, self.value_params)
        ]
        params_flat = []
        for k in range(num_agents):
            params_flat.extend(params[k])

        self.grads_check = grads = [
            tf.gradients(train_loss[k], params[k]) for k in range(num_agents)
        ]
        clone_grads = [
            tf.gradients(lld[k], params[k]) for k in range(num_agents)
        ]

        self.optim = optim = []
        self.clones = clones = []
        update_stats_op = []
        train_op, clone_op, q_runner = [], [], []

        for k in range(num_agents):
            if identical[k]:
                optim.append(optim[-1])
                train_op.append(train_op[-1])
                q_runner.append(q_runner[-1])
                clones.append(clones[-1])
                clone_op.append(clone_op[-1])
            else:
                with tf.variable_scope('optim_%d' % k):
                    optim.append(
                        kfac.KfacOptimizer(learning_rate=PG_LR[k],
                                           clip_kl=kfac_clip,
                                           momentum=0.9,
                                           kfac_update=1,
                                           epsilon=0.01,
                                           stats_decay=0.99,
                                           async_var=0,
                                           cold_iter=10,
                                           max_grad_norm=max_grad_norm))
                    update_stats_op.append(optim[k].compute_and_apply_stats(
                        joint_fisher_loss, var_list=params[k]))
                    train_op_, q_runner_ = optim[k].apply_gradients(
                        list(zip(grads[k], params[k])))
                    train_op.append(train_op_)
                    q_runner.append(q_runner_)

                with tf.variable_scope('clone_%d' % k):
                    clones.append(
                        kfac.KfacOptimizer(learning_rate=PG_LR[k],
                                           clip_kl=kfac_clip,
                                           momentum=0.9,
                                           kfac_update=1,
                                           epsilon=0.01,
                                           stats_decay=0.99,
                                           async_var=0,
                                           cold_iter=10,
                                           max_grad_norm=max_grad_norm))
                    update_stats_op.append(clones[k].compute_and_apply_stats(
                        pg_fisher_loss[k], var_list=self.policy_params[k]))
                    clone_op_, q_runner_ = clones[k].apply_gradients(
                        list(zip(clone_grads[k], self.policy_params[k])))
                    clone_op.append(clone_op_)

        update_stats_op = tf.group(*update_stats_op)
        train_ops = train_op
        clone_ops = clone_op
        train_op = tf.group(*train_op)
        clone_op = tf.group(*clone_op)

        self.q_runner = q_runner
        self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)
        self.clone_lr = Scheduler(v=lr,
                                  nvalues=total_timesteps,
                                  schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = [rewards[k] - values[k] for k in range(num_agents)]
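            # Tick the linear LR schedule once per entry in obs and keep the most
            # recent value as the learning rate for this update.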
            for step in range(len(obs)):
                cur_lr = self.lr.value()

            ob = np.concatenate(obs, axis=1)

            td_map = {}
            for k in range(num_agents):
                if identical[k]:
                    continue
                new_map = {}
                if num_agents > 1:
                    action_v = []
                    for j in range(k, pointer[k]):
                        action_v.append(
                            np.concatenate([
                                multionehot(actions[i], self.n_actions[i])
                                for i in range(num_agents) if i != k
                            ],
                                           axis=1))
                    action_v = np.concatenate(action_v, axis=0)
                    new_map.update({train_model[k].A_v: action_v})
                    td_map.update({train_model[k].A_v: action_v})

                new_map.update({
                    train_model[k].X:
                    np.concatenate([obs[j] for j in range(k, pointer[k])],
                                   axis=0),
                    train_model[k].X_v:
                    np.concatenate([ob.copy() for j in range(k, pointer[k])],
                                   axis=0),
                    A[k]:
                    np.concatenate([actions[j] for j in range(k, pointer[k])],
                                   axis=0),
                    ADV[k]:
                    np.concatenate([advs[j] for j in range(k, pointer[k])],
                                   axis=0),
                    R[k]:
                    np.concatenate([rewards[j] for j in range(k, pointer[k])],
                                   axis=0),
                    PG_LR[k]:
                    cur_lr / float(scale[k])
                })
                sess.run(train_ops[k], feed_dict=new_map)
                td_map.update(new_map)

                if states[k] != []:
                    td_map[train_model[k].S] = states
                    td_map[train_model[k].M] = masks

            policy_loss, value_loss, policy_entropy = sess.run(
                [pg_loss, vf_loss, entropy], td_map)
            return policy_loss, value_loss, policy_entropy

        def clone(obs, actions):
            td_map = {}
            cur_lr = self.clone_lr.value()
            for k in range(num_agents):
                if identical[k]:
                    continue
                new_map = {}
                new_map.update({
                    train_model[k].X:
                    np.concatenate([obs[j] for j in range(k, pointer[k])],
                                   axis=0),
                    A[k]:
                    np.concatenate([actions[j] for j in range(k, pointer[k])],
                                   axis=0),
                    PG_LR[k]:
                    cur_lr / float(scale[k])
                })
                sess.run(clone_ops[k], feed_dict=new_map)
                td_map.update(new_map)
            lld_loss = sess.run([lld], td_map)
            return lld_loss

        def get_log_action_prob(obs, actions):
            action_prob = []
            for k in range(num_agents):
                if identical[k]:
                    continue
                new_map = {
                    train_model[k].X:
                    np.concatenate([obs[j] for j in range(k, pointer[k])],
                                   axis=0),
                    A[k]:
                    np.concatenate([actions[j] for j in range(k, pointer[k])],
                                   axis=0)
                }
                log_pac = sess.run(self.log_pac[k], feed_dict=new_map)
                if scale[k] == 1:
                    action_prob.append(log_pac)
                else:
                    log_pac = np.split(log_pac, scale[k], axis=0)
                    action_prob += log_pac
            return action_prob

        self.get_log_action_prob = get_log_action_prob

        def get_log_action_prob_step(obs, actions):
            action_prob = []
            for k in range(num_agents):
                action_prob.append(step_model[k].step_log_prob(
                    obs[k], actions[k]))
            return action_prob

        self.get_log_action_prob_step = get_log_action_prob_step

        def save(save_path):
            ps = sess.run(params_flat)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params_flat, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)

        self.train = train
        self.clone = clone
        self.save = save
        self.load = load
        self.train_model = train_model
        self.step_model = step_model

        def step(ob, av, *_args, **_kwargs):
            a, v, s = [], [], []
            obs = np.concatenate(ob, axis=1)
            for k in range(num_agents):
                a_v = np.concatenate([
                    multionehot(av[i], self.n_actions[i])
                    for i in range(num_agents) if i != k
                ],
                                     axis=1)
                a_, v_, s_ = step_model[k].step(ob[k], obs, a_v)
                a.append(a_)
                v.append(v_)
                s.append(s_)
            return a, v, s

        self.step = step

        def value(obs, av):
            v = []
            ob = np.concatenate(obs, axis=1)
            for k in range(num_agents):
                a_v = np.concatenate([
                    multionehot(av[i], self.n_actions[i])
                    for i in range(num_agents) if i != k
                ],
                                     axis=1)
                v_ = step_model[k].value(ob, a_v)
                v.append(v_)
            return v

        self.value = value
        self.initial_state = [
            step_model[k].initial_state for k in range(num_agents)
        ]
class Discriminator(object):
    def __init__(self,
                 sess,
                 ob_spaces,
                 ac_spaces,
                 nstack,
                 index,
                 disc_type='decentralized',
                 hidden_size=128,
                 gp_coef=5,
                 lr_rate=5e-4,
                 total_steps=50000,
                 scope="discriminator"):
        self.lr = Scheduler(v=lr_rate,
                            nvalues=total_steps * 20,
                            schedule='linear')
        self.disc_type = disc_type
        if disc_type not in disc_types:
            assert False
        self.scope = scope
        self.index = index
        self.sess = sess
        ob_space = ob_spaces[index]
        ac_space = ac_spaces[index]
        self.ob_shape = ob_space.shape[0] * nstack
        nact = ac_space.n
        self.ac_shape = nact * nstack
        self.all_ob_shape = sum([obs.shape[0] for obs in ob_spaces]) * nstack
        self.all_ac_shape = sum([ac.n for ac in ac_spaces]) * nstack
        self.hidden_size = hidden_size

        if disc_type == 'decentralized':
            input_shape = self.all_ob_shape + self.ac_shape
        elif disc_type == 'centralized':
            input_shape = self.all_ob_shape + self.all_ac_shape
        elif disc_type == 'single':
            input_shape = self.all_ob_shape + self.all_ac_shape
        else:
            assert False

        self.g = tf.placeholder(tf.float32, (None, input_shape))
        self.e = tf.placeholder(tf.float32, (None, input_shape))
        self.lr_rate = tf.placeholder(tf.float32, ())

        num_outputs = len(ob_spaces) if disc_type == 'centralized' else 1
        self.bias = tf.get_variable(name=scope + '_bias',
                                    shape=(num_outputs, ),
                                    initializer=tf.zeros_initializer,
                                    trainable=False)
        self.bias_ph = tf.placeholder(tf.float32, (num_outputs, ))
        self.update_bias = tf.assign(self.bias,
                                     self.bias_ph * 0.01 + self.bias * 0.99)
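        # Non-trainable output bias: build_graph subtracts it from the logits, and
        # self.update_bias moves it by a slow exponential average (0.99 / 0.01) towards
        # whatever is fed through bias_ph (the call in train() is currently commented out).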

        generator_logits = self.build_graph(self.g, num_outputs, reuse=False)
        expert_logits = self.build_graph(self.e, num_outputs, reuse=True)

        self.generator_loss = tf.reduce_mean(generator_logits, axis=0)
        self.expert_loss = tf.reduce_mean(expert_logits, axis=0)

        ddg = tf.gradients(generator_logits, [self.g])
        ddg = tf.sqrt(tf.reduce_sum(tf.square(ddg[0]), axis=1))
        self.ddg = tf.reduce_mean(tf.square(ddg - 1.))

        dde = tf.gradients(expert_logits, [self.e])
        dde = tf.sqrt(tf.reduce_sum(tf.square(dde[0]), axis=1))
        self.dde = tf.reduce_mean(tf.square(dde - 1.))

        epsilon = tf.random_uniform([], 0.0, 1.0)
        ge = self.g * epsilon + self.e * (1 - epsilon)
        gel = self.build_graph(ge, num_outputs, reuse=True)
        ddd = tf.gradients(gel, [ge])[0]
        ddd = tf.norm(ddd, axis=1)
        self.ddd = tf.reduce_mean(tf.square(ddd - 1.))

        self.total_loss = self.generator_loss - self.expert_loss + gp_coef * self.ddd  # or gp_coef * (self.ddg + self.dde)
        self.reward_op = generator_logits

        self.var_list = self.get_trainable_variables()
        self.d_optim = tf.train.AdamOptimizer(self.lr_rate,
                                              beta1=0.5,
                                              beta2=0.9).minimize(
                                                  self.total_loss,
                                                  var_list=self.var_list)
        self.saver = tf.train.Saver(self.get_variables())

    def build_graph(self, x, num_outputs=1, reuse=False):
        with tf.variable_scope(self.scope):
            if reuse:
                tf.get_variable_scope().reuse_variables()
            p_h1 = fc(x, 'fc1', nh=self.hidden_size)
            p_h2 = fc(p_h1, 'fc2', nh=self.hidden_size)
            p_h3 = fc(p_h2, 'fc3', nh=self.hidden_size)
            logits = fc(p_h3, 'out', nh=num_outputs, act=lambda x: x)
            logits -= self.bias
        return logits

    def get_variables(self):
        return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)

    def get_trainable_variables(self):
        return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)

    def get_reward(self, all_obs, acs):
        if len(all_obs.shape) == 1:
            all_obs = np.expand_dims(all_obs, 0)
        if len(acs.shape) == 1:
            acs = np.expand_dims(acs, 0)
        feed_dict = {self.g: np.concatenate([all_obs, acs], axis=1)}
        return self.sess.run(self.reward_op, feed_dict)

    def train(self, g_all_obs, g_acs, e_all_obs, e_acs):
        feed_dict = {
            self.g: np.concatenate([g_all_obs, g_acs], axis=1),
            self.e: np.concatenate([e_all_obs, e_acs], axis=1),
            self.lr_rate: self.lr.value()
        }
        gl, el, _ = self.sess.run(
            [self.generator_loss, self.expert_loss, self.d_optim], feed_dict)
        # self.sess.run(self.update_bias, feed_dict={self.bias_ph: (gl + el) / 2.0})
        return self.sess.run(
            [self.generator_loss, self.expert_loss, self.ddg, self.dde],
            feed_dict)

    def restore(self, path):
        print('restoring from: ' + path)
        self.saver.restore(self.sess, path)