コード例 #1
0
ファイル: acktr_disc.py プロジェクト: xiaoyuanzh/baselines
class Model(object):
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 total_timesteps,
                 nprocs=32,
                 nsteps=20,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 vf_fisher_coef=1.0,
                 lr=0.25,
                 max_grad_norm=0.5,
                 kfac_clip=0.001,
                 lrschedule='linear'):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=nprocs,
                                inter_op_parallelism_threads=nprocs)
        config.gpu_options.allow_growth = True
        self.sess = sess = tf.Session(config=config)
        nact = ac_space.n
        nbatch = nenvs * nsteps
        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        PG_LR = tf.placeholder(tf.float32, [])
        VF_LR = tf.placeholder(tf.float32, [])

        self.model = step_model = policy(sess,
                                         ob_space,
                                         ac_space,
                                         nenvs,
                                         1,
                                         reuse=False)
        self.model2 = train_model = policy(sess,
                                           ob_space,
                                           ac_space,
                                           nenvs * nsteps,
                                           nsteps,
                                           reuse=True)

        logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.pi, labels=A)
        self.logits = logits = train_model.pi

        ##training loss
        pg_loss = tf.reduce_mean(ADV * logpac)
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        pg_loss = pg_loss - ent_coef * entropy
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        train_loss = pg_loss + vf_coef * vf_loss

        ##Fisher loss construction
        self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac)
        sample_net = train_model.vf + tf.random_normal(tf.shape(
            train_model.vf))
        self.vf_fisher = vf_fisher_loss = -vf_fisher_coef * tf.reduce_mean(
            tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
        self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss

        self.params = params = find_trainable_variables("model")

        self.grads_check = grads = tf.gradients(train_loss, params)

        with tf.device('/gpu:0'):
            self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,\
                momentum=0.9, kfac_update=1, epsilon=0.01,\
                stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)

            update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss,
                                                            var_list=params)
            train_op, q_runner = optim.apply_gradients(list(zip(grads,
                                                                params)))
        self.q_runner = q_runner
        self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = self.lr.value()

            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                PG_LR: cur_lr
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, train_op], td_map)
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)

        self.train = train
        self.save = save
        self.load = load
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        tf.global_variables_initializer().run(session=sess)
コード例 #2
0
class Model(object):
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 n_envs,
                 total_timesteps,
                 nprocs=32,
                 n_steps=20,
                 ent_coef=0.01,
                 vf_coef=0.25,
                 vf_fisher_coef=1.0,
                 learning_rate=0.25,
                 max_grad_norm=0.5,
                 kfac_clip=0.001,
                 lr_schedule='linear'):
        """
        The ACKTR (Actor Critic using Kronecker-Factored Trust Region) model class, https://arxiv.org/abs/1708.05144

        :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
        :param ob_space: (Gym Space) The observation space
        :param ac_space: (Gym Space) The action space
        :param n_envs: (int) The number of environments
        :param total_timesteps: (int) The total number of timesteps for training the model
        :param nprocs: (int) The number of threads for TensorFlow operations
        :param n_steps: (int) The number of steps to run for each environment
        :param ent_coef: (float) The weight for the entropic loss
        :param vf_coef: (float) The weight for the loss on the value function
        :param vf_fisher_coef: (float) The weight for the fisher loss on the value function
        :param learning_rate: (float) The initial learning rate for the RMS prop optimizer
        :param max_grad_norm: (float) The clipping value for the maximum gradient
        :param kfac_clip: (float) gradient clipping for Kullback leiber
        :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant',
                                 'double_linear_con', 'middle_drop' or 'double_middle_drop')
        """

        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=nprocs,
                                inter_op_parallelism_threads=nprocs)
        config.gpu_options.allow_growth = True
        self.sess = sess = tf.Session(config=config)
        n_batch = n_envs * n_steps
        action_ph = tf.placeholder(tf.int32, [n_batch])
        advs_ph = tf.placeholder(tf.float32, [n_batch])
        rewards_ph = tf.placeholder(tf.float32, [n_batch])
        pg_lr_ph = tf.placeholder(tf.float32, [])

        self.model = step_model = policy(sess,
                                         ob_space,
                                         ac_space,
                                         n_envs,
                                         1,
                                         reuse=False)
        self.model2 = train_model = policy(sess,
                                           ob_space,
                                           ac_space,
                                           n_envs * n_steps,
                                           n_steps,
                                           reuse=True)

        logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.policy, labels=action_ph)
        self.logits = train_model.policy

        # training loss
        pg_loss = tf.reduce_mean(advs_ph * logpac)
        entropy = tf.reduce_mean(calc_entropy(train_model.policy))
        pg_loss = pg_loss - ent_coef * entropy
        vf_loss = mse(tf.squeeze(train_model.value_fn), rewards_ph)
        train_loss = pg_loss + vf_coef * vf_loss

        # Fisher loss construction
        self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac)
        sample_net = train_model.value_fn + tf.random_normal(
            tf.shape(train_model.value_fn))
        self.vf_fisher = vf_fisher_loss = -vf_fisher_coef * tf.reduce_mean(
            tf.pow(train_model.value_fn - tf.stop_gradient(sample_net), 2))
        self.joint_fisher = pg_fisher_loss + vf_fisher_loss

        self.params = params = find_trainable_variables("model")

        self.grads_check = grads = tf.gradients(train_loss, params)

        with tf.device('/gpu:0'):
            self.optim = optim = kfac.KfacOptimizer(
                learning_rate=pg_lr_ph,
                clip_kl=kfac_clip,
                momentum=0.9,
                kfac_update=1,
                epsilon=0.01,
                stats_decay=0.99,
                async=1,
                cold_iter=10,
                max_grad_norm=max_grad_norm)

            optim.compute_and_apply_stats(self.joint_fisher, var_list=params)
            train_op, q_runner = optim.apply_gradients(list(zip(grads,
                                                                params)))
        self.q_runner = q_runner
        self.learning_rate = Scheduler(initial_value=learning_rate,
                                       n_values=total_timesteps,
                                       schedule=lr_schedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for _ in range(len(obs)):
                cur_lr = self.learning_rate.value()

            td_map = {
                train_model.obs_ph: obs,
                action_ph: actions,
                advs_ph: advs,
                rewards_ph: rewards,
                pg_lr_ph: cur_lr
            }
            if states is not None:
                td_map[train_model.states_ph] = states
                td_map[train_model.masks_ph] = masks

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, train_op], td_map)
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            session_params = sess.run(params)
            joblib.dump(session_params, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for param, loaded_p in zip(params, loaded_params):
                restores.append(param.assign(loaded_p))
            sess.run(restores)

        self.train = train
        self.save = save
        self.load = load
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        tf.global_variables_initializer().run(session=sess)
コード例 #3
0
class Model(object):
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 total_timesteps,
                 nprocs=32,
                 nsteps=20,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 vf_fisher_coef=1.0,
                 lr=0.25,
                 max_grad_norm=0.5,
                 kfac_clip=0.001,
                 lrschedule='linear',
                 is_async=True):

        self.sess = sess = get_session()
        nbatch = nenvs * nsteps
        A = tf.placeholder(ac_space.dtype, [
            nbatch,
        ] + list(ac_space.shape))
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        PG_LR = tf.placeholder(tf.float32, [])
        VF_LR = tf.placeholder(tf.float32, [])

        with tf.variable_scope('acktr_model', reuse=tf.AUTO_REUSE):
            self.model = step_model = policy(nenvs, 1, sess=sess)
            self.model2 = train_model = policy(nenvs * nsteps,
                                               nsteps,
                                               sess=sess)

        neglogpac = train_model.pd.neglogp(A)
        self.logits = train_model.pi

        ##training loss
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        entropy = tf.reduce_mean(train_model.pd.entropy())
        pg_loss = pg_loss - ent_coef * entropy
        vf_loss = tf.losses.mean_squared_error(tf.squeeze(train_model.vf), R)
        train_loss = pg_loss + vf_coef * vf_loss

        ##Fisher loss construction
        self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(neglogpac)
        sample_net = train_model.vf + tf.random_normal(tf.shape(
            train_model.vf))
        self.vf_fisher = vf_fisher_loss = -vf_fisher_coef * tf.reduce_mean(
            tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
        self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss

        self.params = params = find_trainable_variables("acktr_model")

        self.grads_check = grads = tf.gradients(train_loss, params)

        with tf.device('/gpu:0'):
            self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,\
                momentum=0.9, kfac_update=1, epsilon=0.01,\
                stats_decay=0.99, is_async=is_async, cold_iter=10, max_grad_norm=max_grad_norm)

            # update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
            optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
            train_op, q_runner = optim.apply_gradients(list(zip(grads,
                                                                params)))
        self.q_runner = q_runner
        self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = self.lr.value()

            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                PG_LR: cur_lr,
                VF_LR: cur_lr
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, train_op], td_map)
            return policy_loss, value_loss, policy_entropy

        self.train = train
        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        tf.global_variables_initializer().run(session=sess)
コード例 #4
0
class Model(object):
  def __init__(self,
               policy,
               ob_space,
               ac_space,
               nenvs,
               total_timesteps,
               nprocs=32,
               nscripts=16,
               nsteps=20,
               nstack=4,
               ent_coef=0.1,
               vf_coef=0.5,
               vf_fisher_coef=1.0,
               lr=0.25,
               max_grad_norm=0.001,
               kfac_clip=0.001,
               lrschedule='linear',
               alpha=0.99,
               epsilon=1e-5):
    config = tf.ConfigProto(
        allow_soft_placement=True,
        intra_op_parallelism_threads=nprocs,
        inter_op_parallelism_threads=nprocs)
    config.gpu_options.allow_growth = True
    self.sess = sess = tf.Session(config=config)
    nsml.bind(sess=sess)
    #nact = ac_space.n
    nbatch = nenvs * nsteps
    A = tf.placeholder(tf.int32, [nbatch])

    XY0 = tf.placeholder(tf.int32, [nbatch])
    XY1 = tf.placeholder(tf.int32, [nbatch])

    # ADV == TD_TARGET - values
    ADV = tf.placeholder(tf.float32, [nbatch])
    TD_TARGET = tf.placeholder(tf.float32, [nbatch])
    PG_LR = tf.placeholder(tf.float32, [])
    VF_LR = tf.placeholder(tf.float32, [])

    self.model = step_model = policy(
        sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
    self.model2 = train_model = policy(
        sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True)

    # Policy 1 : Base Action : train_model.pi label = A

    script_mask = tf.concat(
        [
            tf.zeros([nscripts * nsteps, 1]),
            tf.ones([(nprocs - nscripts) * nsteps, 1])
        ],
        axis=0)

    pi = train_model.pi
    pac_weight = script_mask * (tf.nn.softmax(pi) - 1.0) + 1.0
    pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(A, depth=3), axis=1)
    neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=pi, labels=A)
    neglogpac *= tf.stop_gradient(pac_weight)

    inv_A = 1.0 - tf.cast(A, tf.float32)

    xy0_mask = tf.cast(A, tf.float32)
    xy1_mask = tf.cast(A, tf.float32)

    condition0 = tf.equal(xy0_mask, 2)
    xy0_mask = tf.where(condition0, tf.ones(tf.shape(xy0_mask)), xy0_mask)
    xy0_mask = 1.0 - xy0_mask

    condition1 = tf.equal(xy1_mask, 2)
    xy1_mask = tf.where(condition1, tf.zeros(tf.shape(xy1_mask)), xy1_mask)

    # One hot representation of chosen marine.
    # [batch_size, 2]
    pi_xy0 = train_model.pi_xy0
    pac_weight = script_mask * (tf.nn.softmax(pi_xy0) - 1.0) + 1.0
    pac_weight = tf.reduce_sum(
        pac_weight * tf.one_hot(XY0, depth=1024), axis=1)

    logpac_xy0 = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=pi_xy0, labels=XY0)
    logpac_xy0 *= tf.stop_gradient(pac_weight)
    logpac_xy0 *= tf.cast(xy0_mask, tf.float32)

    pi_xy1 = train_model.pi_xy1
    pac_weight = script_mask * (tf.nn.softmax(pi_xy1) - 1.0) + 1.0
    pac_weight = tf.reduce_sum(
        pac_weight * tf.one_hot(XY0, depth=1024), axis=1)

    # 1D? 2D?
    logpac_xy1 = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=pi_xy1, labels=XY1)
    logpac_xy1 *= tf.stop_gradient(pac_weight)
    logpac_xy1 *= tf.cast(xy1_mask, tf.float32)

    pg_loss = tf.reduce_mean(ADV * neglogpac)
    pg_loss_xy0 = tf.reduce_mean(ADV * logpac_xy0)
    pg_loss_xy1 = tf.reduce_mean(ADV * logpac_xy1)

    vf_ = tf.squeeze(train_model.vf)

    vf_r = tf.concat(
        [
            tf.ones([nscripts * nsteps, 1]),
            tf.zeros([(nprocs - nscripts) * nsteps, 1])
        ],
        axis=0) * TD_TARGET
    vf_masked = vf_ * script_mask + vf_r

    #vf_mask[0:nscripts * nsteps] = R[0:nscripts * nsteps]

    vf_loss = tf.reduce_mean(mse(vf_masked, TD_TARGET))
    entropy_a = tf.reduce_mean(cat_entropy(train_model.pi))
    entropy_xy0 = tf.reduce_mean(cat_entropy(train_model.pi_xy0))
    entropy_xy1 = tf.reduce_mean(cat_entropy(train_model.pi_xy1))
    entropy = entropy_a + entropy_xy0 + entropy_xy1

    loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

    params = find_trainable_variables("model")
    grads = tf.gradients(loss, params)
    if max_grad_norm is not None:
      grads, _ = tf.clip_by_global_norm(grads, max_grad_norm)
    grads = list(zip(grads, params))
    trainer = tf.train.RMSPropOptimizer(
        learning_rate=lr, decay=alpha, epsilon=epsilon)
    _train = trainer.apply_gradients(grads)

    self.logits = logits = train_model.pi

    # xy0

    self.params_common = params_common = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/common')
    self.params_xy0 = params_xy0 = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES,
        scope='model/xy0') + params_common

    train_loss_xy0 = pg_loss_xy0 - entropy * ent_coef + vf_coef * vf_loss

    self.grads_check_xy0 = grads_xy0 = tf.gradients(
        train_loss_xy0, params_xy0)
    if max_grad_norm is not None:
      grads_xy0, _ = tf.clip_by_global_norm(grads_xy0, max_grad_norm)

    grads_xy0 = list(zip(grads_xy0, params_xy0))
    trainer_xy0 = tf.train.RMSPropOptimizer(
        learning_rate=lr, decay=alpha, epsilon=epsilon)
    _train_xy0 = trainer_xy0.apply_gradients(grads_xy0)

    # xy1

    self.params_xy1 = params_xy1 = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES,
        scope='model/xy1') + params_common

    train_loss_xy1 = pg_loss_xy1 - entropy * ent_coef + vf_coef * vf_loss

    self.grads_check_xy1 = grads_xy1 = tf.gradients(
        train_loss_xy1, params_xy1)
    if max_grad_norm is not None:
      grads_xy1, _ = tf.clip_by_global_norm(grads_xy1, max_grad_norm)

    grads_xy1 = list(zip(grads_xy1, params_xy1))
    trainer_xy1 = tf.train.RMSPropOptimizer(
        learning_rate=lr, decay=alpha, epsilon=epsilon)
    _train_xy1 = trainer_xy1.apply_gradients(grads_xy1)

    self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

    def train(obs, states, td_targets, masks, actions, xy0, xy1, values):
      advs = td_targets - values
      for step in range(len(obs)):
        cur_lr = self.lr.value()

      td_map = {
          train_model.X: obs,
          A: actions,
          XY0: xy0,
          XY1: xy1,
          ADV: advs,
          TD_TARGET: td_targets,
          PG_LR: cur_lr
      }
      if states != []:
        td_map[train_model.S] = states
        td_map[train_model.M] = masks

      policy_loss, value_loss, policy_entropy, _, \
      policy_loss_xy0, policy_entropy_xy0, _, \
      policy_loss_xy1, policy_entropy_xy1, _ = sess.run(
          [pg_loss, vf_loss, entropy, _train,
           pg_loss_xy0, entropy_xy0, _train_xy0,
           pg_loss_xy1, entropy_xy1, _train_xy1],
          td_map)
      return policy_loss, value_loss, policy_entropy, \
             policy_loss_xy0, policy_entropy_xy0, \
             policy_loss_xy1, policy_entropy_xy1

    def save(save_path):
      ps = sess.run(params)
      joblib.dump(ps, save_path)

    def load(load_path):
      loaded_params = joblib.load(load_path)
      restores = []
      for p, loaded_p in zip(params, loaded_params):
        restores.append(p.assign(loaded_p))
      sess.run(restores)

    self.train = train
    self.save = save
    self.load = load
    self.train_model = train_model
    self.step_model = step_model
    self.step = step_model.step
    self.value = step_model.value
    self.initial_state = step_model.initial_state
    print("global_variables_initializer start")
    tf.global_variables_initializer().run(session=sess)
    print("global_variables_initializer complete")
コード例 #5
0
def learn_hoof_a2c(
        network,
        env,
        seed=None,
        nsteps=5,
        total_timesteps=int(80e6),
        vf_coef=0.5,
        ent_coef=0.01,
        max_grad_norm=0.5,
        lr=7e-4,
        lrschedule='linear',
        epsilon=1e-5,
        alpha=0.99,
        gamma=0.99,
        log_interval=100,
        load_path=None,  # Baselines default settings till here
        optimiser='RMSProp',
        lr_upper_bound=None,
        ent_upper_bound=None,
        num_lr=None,
        num_ent_coeff=None,
        max_kl=-1.0,  # -1.0 is for no KL constraint
        **network_kwargs):
    '''
    Main entrypoint for A2C algorithm. Train a policy with given network architecture on a given environment using a2c algorithm.

    Parameters:
    -----------

    network:            policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                        specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                        tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                        neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                        See baselines.common/policies.py/lstm for more details on using recurrent nets in policies


    env:                RL environment. Should implement interface similar to VecEnv (baselines.common/vec_env) or be wrapped with DummyVecEnv (baselines.common/vec_env/dummy_vec_env.py)


    seed:               seed to make random number sequence in the alorightm reproducible. By default is None which means seed from system noise generator (not reproducible)

    nsteps:             int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                        nenv is number of environment copies simulated in parallel)

    total_timesteps:    int, total number of timesteps to train on (default: 80M)

    max_gradient_norm:  float, gradient is clipped to have global L2 norm no more than this value (default: 0.5)

    lr:                 float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4)

    gamma:              float, reward discounting parameter (default: 0.99)

    log_interval:       int, specifies how frequently the logs are printed out (default: 100)

    **network_kwargs:   keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                        For instance, 'mlp' network architecture has arguments num_hidden and num_layers.

    '''

    set_global_seeds(seed)

    # Get the nb of env
    nenvs = env.num_envs
    policy = build_policy(env, network, **network_kwargs)

    # overwrite default params if using HOOF
    if lr_upper_bound is not None:
        lr = 1.0
        lrschedule = 'constant'
    else:
        num_lr = 1

    if ent_upper_bound is None:
        num_ent_coeff = 1

    # Instantiate the model object (that creates step_model and train_model)
    model = HOOF_Model(
        policy=policy,
        env=env,
        nsteps=nsteps,
        optimiser=optimiser,
        ent_coef=ent_coef,
        vf_coef=vf_coef,
        max_grad_norm=max_grad_norm,
        total_timesteps=total_timesteps,
        alpha=alpha,
        epsilon=epsilon  # defaults for RMSProp
    )

    runner = HOOF_Runner(env, model, nsteps=nsteps, gamma=gamma)
    epinfobuf = deque(maxlen=100)

    lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

    # Calculate the batch_size
    nbatch = nenvs * nsteps

    # model helper functions
    model_params = find_trainable_variables("a2c_model")
    get_flat = U.GetFlat(model_params)
    set_from_flat = U.SetFromFlat(model_params)

    # for Gaussian policies
    def kl(new_mean, new_sd, old_mean, old_sd):
        approx_kl = np.log(new_sd / old_sd) + (
            old_sd**2 +
            (old_mean - new_mean)**2) / (2.0 * new_sd**2 + 10**-8) - 0.5
        approx_kl = np.sum(approx_kl, axis=1)
        approx_kl = np.mean(approx_kl)
        return approx_kl

    if max_kl == -1.0:  # set max kl to a high val in case there is no constraint
        max_kl = 10**8

    # Start total timer
    tstart = time.time()

    for update in range(1, int(total_timesteps // nbatch + 1)):
        obs, states, rewards, masks, actions, values, undisc_rwds, epinfos = runner.run(
        )
        epinfobuf.extend(epinfos)
        old_mean, old_sd, old_neg_ll = model.get_mean_std_neg_ll(obs, actions)
        for step in range(len(obs)):
            cur_lr = lr.value()

        opt_pol_val = -10**8
        old_params = get_flat()
        rms_weights_before_upd = model.get_opt_state()
        approx_kl = np.zeros((num_ent_coeff, num_lr))
        epv = np.zeros((num_ent_coeff, num_lr))
        rand_lr = lr_upper_bound * np.random.rand(
            num_lr) if lr_upper_bound is not None else [cur_lr]
        rand_lr = np.sort(rand_lr)
        rand_ent_coeff = ent_upper_bound * np.random.rand(
            num_ent_coeff) if ent_upper_bound is not None else [ent_coef]

        for nec in range(num_ent_coeff):
            # reset policy and optimiser
            set_from_flat(old_params)
            model.set_opt_state(rms_weights_before_upd)

            # get grads for loss fn with given entropy coeff
            policy_loss, value_loss, policy_entropy = model.train(
                obs, states, rewards, masks, actions, values,
                rand_ent_coeff[nec])
            new_params = get_flat()
            ent_grads = new_params - old_params

            # enumerate over different LR
            for nlr in range(num_lr):
                new_params = old_params + rand_lr[nlr] * ent_grads
                set_from_flat(new_params)
                new_mean, new_sd, new_neg_ll = model.get_mean_std_neg_ll(
                    obs, actions)
                lik_ratio = np.exp(-new_neg_ll + old_neg_ll)
                est_pol_val = wis_estimate(nenvs, nsteps, undisc_rwds,
                                           lik_ratio)
                approx_kl[nec, nlr] = kl(new_mean, new_sd, old_mean, old_sd)
                epv[nec, nlr] = est_pol_val

                if (nec == 0
                        and nlr == 0) or (est_pol_val > opt_pol_val
                                          and approx_kl[nec, nlr] < max_kl):
                    opt_pol_val = est_pol_val
                    opt_pol_params = get_flat()
                    opt_rms_wts = model.get_opt_state()
                    opt_lr = rand_lr[nlr]
                    opt_ent_coeff = rand_ent_coeff[nec]
                    opt_kl = approx_kl[nec, nlr]

        # update policy and rms prop to optimal wts
        set_from_flat(opt_pol_params)
        model.set_opt_state(opt_rms_wts)

        # Shrink LR search space if too many get rejected
        if lr_upper_bound is not None:
            rejections = np.sum(approx_kl > max_kl) / num_lr
            if rejections > 0.8:
                lr_upper_bound *= 0.8
            if rejections == 0:
                lr_upper_bound *= 1.25

        nseconds = time.time() - tstart

        # Calculate the fps (frame per second)
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            # Calculates if value function is a good predicator of the returns (ev > 1)
            # or if it's just worse than predicting nothing (ev =< 0)
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("opt_lr", float(opt_lr))
            logger.record_tabular("opt_ent_coeff", float(opt_ent_coeff))
            logger.record_tabular("approx_kl", float(opt_kl))
            if lr_upper_bound is not None:
                logger.record_tabular("rejections", rejections)
                logger.record_tabular("lr_ub", lr_upper_bound)
            logger.record_tabular(
                "eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.record_tabular(
                "eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.dump_tabular()
    return model
コード例 #6
0
ファイル: acktr.py プロジェクト: MrGoogol/baselines
class Model(object):

    def __init__(self, policy, ob_space, ac_space, nenvs,total_timesteps, nprocs=32, nsteps=20,
                 ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
                 kfac_clip=0.001, lrschedule='linear', is_async=True):

        self.sess = sess = get_session()
        nbatch = nenvs * nsteps
        with tf.variable_scope('acktr_model', reuse=tf.AUTO_REUSE):
            self.model = step_model = policy(nenvs, 1, sess=sess)
            self.model2 = train_model = policy(nenvs*nsteps, nsteps, sess=sess)

        A = train_model.pdtype.sample_placeholder([None])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        PG_LR = tf.placeholder(tf.float32, [])
        VF_LR = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)
        self.logits = train_model.pi

        ##training loss
        pg_loss = tf.reduce_mean(ADV*neglogpac)
        entropy = tf.reduce_mean(train_model.pd.entropy())
        pg_loss = pg_loss - ent_coef * entropy
        vf_loss = tf.losses.mean_squared_error(tf.squeeze(train_model.vf), R)
        train_loss = pg_loss + vf_coef * vf_loss


        ##Fisher loss construction
        self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(neglogpac)
        sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf))
        self.vf_fisher = vf_fisher_loss = - vf_fisher_coef*tf.reduce_mean(tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
        self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss

        self.params=params = find_trainable_variables("acktr_model")

        self.grads_check = grads = tf.gradients(train_loss,params)

        with tf.device('/gpu:0'):
            self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,\
                momentum=0.9, kfac_update=1, epsilon=0.01,\
                stats_decay=0.99, is_async=is_async, cold_iter=10, max_grad_norm=max_grad_norm)

            # update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
            optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
            train_op, q_runner = optim.apply_gradients(list(zip(grads,params)))
        self.q_runner = q_runner
        self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = self.lr.value()

            td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, PG_LR:cur_lr, VF_LR:cur_lr}
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, train_op],
                td_map
            )
            return policy_loss, value_loss, policy_entropy


        self.train = train
        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        tf.global_variables_initializer().run(session=sess)
コード例 #7
0
ファイル: acktr_disc.py プロジェクト: Divyankpandey/baselines
class Model(object):

    def __init__(self, policy, ob_space, ac_space, nenvs,total_timesteps, nprocs=32, nsteps=20,
                 ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
                 kfac_clip=0.001, lrschedule='linear'):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=nprocs,
                                inter_op_parallelism_threads=nprocs)
        config.gpu_options.allow_growth = True
        self.sess = sess = tf.Session(config=config)
        nact = ac_space.n
        nbatch = nenvs * nsteps
        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        PG_LR = tf.placeholder(tf.float32, [])
        VF_LR = tf.placeholder(tf.float32, [])

        self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
        self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True)

        logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)
        self.logits = logits = train_model.pi

        ##training loss
        pg_loss = tf.reduce_mean(ADV*logpac)
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        pg_loss = pg_loss - ent_coef * entropy
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        train_loss = pg_loss + vf_coef * vf_loss


        ##Fisher loss construction
        self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac)
        sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf))
        self.vf_fisher = vf_fisher_loss = - vf_fisher_coef*tf.reduce_mean(tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
        self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss

        self.params=params = find_trainable_variables("model")

        self.grads_check = grads = tf.gradients(train_loss,params)

        with tf.device('/gpu:0'):
            self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,\
                momentum=0.9, kfac_update=1, epsilon=0.01,\
                stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)

            update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
            train_op, q_runner = optim.apply_gradients(list(zip(grads,params)))
        self.q_runner = q_runner
        self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = self.lr.value()

            td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, PG_LR:cur_lr}
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, train_op],
                td_map
            )
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)



        self.train = train
        self.save = save
        self.load = load
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        tf.global_variables_initializer().run(session=sess)
コード例 #8
0
ファイル: a2c.py プロジェクト: moon0823/pysc2-examples
class Model(object):
  def __init__(self,
               policy,
               ob_space,
               ac_space,
               nenvs,
               total_timesteps,
               nprocs=32,
               nscripts=16,
               nsteps=20,
               nstack=4,
               ent_coef=0.1,
               vf_coef=0.5,
               vf_fisher_coef=1.0,
               lr=0.25,
               max_grad_norm=0.001,
               kfac_clip=0.001,
               lrschedule='linear',
               alpha=0.99,
               epsilon=1e-5):
    config = tf.ConfigProto(
        allow_soft_placement=True,
        intra_op_parallelism_threads=nprocs,
        inter_op_parallelism_threads=nprocs)
    config.gpu_options.allow_growth = True
    self.sess = sess = tf.Session(config=config)
    nsml.bind(sess=sess)
    #nact = ac_space.n
    nbatch = nenvs * nsteps
    A = tf.placeholder(tf.int32, [nbatch])

    XY0 = tf.placeholder(tf.int32, [nbatch])
    XY1 = tf.placeholder(tf.int32, [nbatch])

    # ADV == TD_TARGET - values
    ADV = tf.placeholder(tf.float32, [nbatch])
    TD_TARGET = tf.placeholder(tf.float32, [nbatch])
    PG_LR = tf.placeholder(tf.float32, [])
    VF_LR = tf.placeholder(tf.float32, [])

    self.model = step_model = policy(
        sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
    self.model2 = train_model = policy(
        sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True)

    # Policy 1 : Base Action : train_model.pi label = A

    script_mask = tf.concat(
        [
            tf.zeros([nscripts * nsteps, 1]),
            tf.ones([(nprocs - nscripts) * nsteps, 1])
        ],
        axis=0)

    pi = train_model.pi
    pac_weight = script_mask * (tf.nn.softmax(pi) - 1.0) + 1.0
    pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(A, depth=3), axis=1)
    neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=pi, labels=A)
    neglogpac *= tf.stop_gradient(pac_weight)

    inv_A = 1.0 - tf.cast(A, tf.float32)

    xy0_mask = tf.cast(A, tf.float32)
    xy1_mask = tf.cast(A, tf.float32)

    condition0 = tf.equal(xy0_mask, 2)
    xy0_mask = tf.where(condition0, tf.ones(tf.shape(xy0_mask)), xy0_mask)
    xy0_mask = 1.0 - xy0_mask

    condition1 = tf.equal(xy1_mask, 2)
    xy1_mask = tf.where(condition1, tf.zeros(tf.shape(xy1_mask)), xy1_mask)

    # One hot representation of chosen marine.
    # [batch_size, 2]
    pi_xy0 = train_model.pi_xy0
    pac_weight = script_mask * (tf.nn.softmax(pi_xy0) - 1.0) + 1.0
    pac_weight = tf.reduce_sum(
        pac_weight * tf.one_hot(XY0, depth=1024), axis=1)

    logpac_xy0 = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=pi_xy0, labels=XY0)
    logpac_xy0 *= tf.stop_gradient(pac_weight)
    logpac_xy0 *= tf.cast(xy0_mask, tf.float32)

    pi_xy1 = train_model.pi_xy1
    pac_weight = script_mask * (tf.nn.softmax(pi_xy1) - 1.0) + 1.0
    pac_weight = tf.reduce_sum(
        pac_weight * tf.one_hot(XY0, depth=1024), axis=1)

    # 1D? 2D?
    logpac_xy1 = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=pi_xy1, labels=XY1)
    logpac_xy1 *= tf.stop_gradient(pac_weight)
    logpac_xy1 *= tf.cast(xy1_mask, tf.float32)

    pg_loss = tf.reduce_mean(ADV * neglogpac)
    pg_loss_xy0 = tf.reduce_mean(ADV * logpac_xy0)
    pg_loss_xy1 = tf.reduce_mean(ADV * logpac_xy1)

    vf_ = tf.squeeze(train_model.vf)

    vf_r = tf.concat(
        [
            tf.ones([nscripts * nsteps, 1]),
            tf.zeros([(nprocs - nscripts) * nsteps, 1])
        ],
        axis=0) * TD_TARGET
    vf_masked = vf_ * script_mask + vf_r

    #vf_mask[0:nscripts * nsteps] = R[0:nscripts * nsteps]

    vf_loss = tf.reduce_mean(mse(vf_masked, TD_TARGET))
    entropy_a = tf.reduce_mean(cat_entropy(train_model.pi))
    entropy_xy0 = tf.reduce_mean(cat_entropy(train_model.pi_xy0))
    entropy_xy1 = tf.reduce_mean(cat_entropy(train_model.pi_xy1))
    entropy = entropy_a + entropy_xy0 + entropy_xy1

    loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

    params = find_trainable_variables("model")
    grads = tf.gradients(loss, params)
    if max_grad_norm is not None:
      grads, _ = tf.clip_by_global_norm(grads, max_grad_norm)
    grads = list(zip(grads, params))
    trainer = tf.train.RMSPropOptimizer(
        learning_rate=lr, decay=alpha, epsilon=epsilon)
    _train = trainer.apply_gradients(grads)

    self.logits = logits = train_model.pi

    # xy0

    self.params_common = params_common = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/common')
    self.params_xy0 = params_xy0 = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES,
        scope='model/xy0') + params_common

    train_loss_xy0 = pg_loss_xy0 - entropy * ent_coef + vf_coef * vf_loss

    self.grads_check_xy0 = grads_xy0 = tf.gradients(
        train_loss_xy0, params_xy0)
    if max_grad_norm is not None:
      grads_xy0, _ = tf.clip_by_global_norm(grads_xy0, max_grad_norm)

    grads_xy0 = list(zip(grads_xy0, params_xy0))
    trainer_xy0 = tf.train.RMSPropOptimizer(
        learning_rate=lr, decay=alpha, epsilon=epsilon)
    _train_xy0 = trainer_xy0.apply_gradients(grads_xy0)

    # xy1

    self.params_xy1 = params_xy1 = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES,
        scope='model/xy1') + params_common

    train_loss_xy1 = pg_loss_xy1 - entropy * ent_coef + vf_coef * vf_loss

    self.grads_check_xy1 = grads_xy1 = tf.gradients(
        train_loss_xy1, params_xy1)
    if max_grad_norm is not None:
      grads_xy1, _ = tf.clip_by_global_norm(grads_xy1, max_grad_norm)

    grads_xy1 = list(zip(grads_xy1, params_xy1))
    trainer_xy1 = tf.train.RMSPropOptimizer(
        learning_rate=lr, decay=alpha, epsilon=epsilon)
    _train_xy1 = trainer_xy1.apply_gradients(grads_xy1)

    self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

    def train(obs, states, td_targets, masks, actions, xy0, xy1, values):
      advs = td_targets - values
      for step in range(len(obs)):
        cur_lr = self.lr.value()

      td_map = {
          train_model.X: obs,
          A: actions,
          XY0: xy0,
          XY1: xy1,
          ADV: advs,
          TD_TARGET: td_targets,
          PG_LR: cur_lr
      }
      if states != []:
        td_map[train_model.S] = states
        td_map[train_model.M] = masks

      policy_loss, value_loss, policy_entropy, _, \
      policy_loss_xy0, policy_entropy_xy0, _, \
      policy_loss_xy1, policy_entropy_xy1, _ = sess.run(
          [pg_loss, vf_loss, entropy, _train,
           pg_loss_xy0, entropy_xy0, _train_xy0,
           pg_loss_xy1, entropy_xy1, _train_xy1],
          td_map)
      return policy_loss, value_loss, policy_entropy, \
             policy_loss_xy0, policy_entropy_xy0, \
             policy_loss_xy1, policy_entropy_xy1

    def save(save_path):
      ps = sess.run(params)
      joblib.dump(ps, save_path)

    def load(load_path):
      loaded_params = joblib.load(load_path)
      restores = []
      for p, loaded_p in zip(params, loaded_params):
        restores.append(p.assign(loaded_p))
      sess.run(restores)

    self.train = train
    self.save = save
    self.load = load
    self.train_model = train_model
    self.step_model = step_model
    self.step = step_model.step
    self.value = step_model.value
    self.initial_state = step_model.initial_state
    print("global_variables_initializer start")
    tf.global_variables_initializer().run(session=sess)
    print("global_variables_initializer complete")