Example #1
        def update_loss_trainer(progress):
            # Update all coefficients according to training progress
            pg_coef = np.clip(self.start_pg_coef * (1.0 - 2.0 * progress), 0.0,
                              1.0)
            st_coef = np.clip(self.start_st_coef * (1.0 - 2.0 * progress), 0.0,
                              1.0)
            pg_lt_coef = np.clip(self.start_pg_lt_coef + 2.0 * progress,
                                 self.start_pg_lt_coef, 1.0)

            loss = pg_coef * pg_loss - entropy * ent_coef + (
                vf_loss + vf_loss_lt
            ) * vf_coef + st_coef * st_loss + pg_lt_coef * pg_loss_lt

            # Update parameters using loss
            # 1. Get the model parameters
            params = find_trainable_variables("a2c_model")

            # 2. Calculate the gradients
            grads = tf.gradients(loss, params)
            if self.max_grad_norm is not None:
                # Clip the gradients (normalize)
                grads, grad_norm = tf.clip_by_global_norm(
                    grads, self.max_grad_norm)
            grads = list(zip(grads, params))

            _train = trainer.apply_gradients(grads)

            return pg_coef, st_coef, pg_lt_coef
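
For intuition, here is a minimal NumPy sketch (with assumed starting values for the three coefficients, chosen only for illustration) of how the schedule above behaves: pg_coef and st_coef decay linearly to 0 by the halfway point of training, while pg_lt_coef ramps from its starting value up to 1.0.

import numpy as np

# Assumed starting values, purely for illustration.
start_pg_coef, start_st_coef, start_pg_lt_coef = 1.0, 1.0, 0.1

for progress in (0.0, 0.25, 0.5, 0.75, 1.0):
    pg_coef = np.clip(start_pg_coef * (1.0 - 2.0 * progress), 0.0, 1.0)
    st_coef = np.clip(start_st_coef * (1.0 - 2.0 * progress), 0.0, 1.0)
    pg_lt_coef = np.clip(start_pg_lt_coef + 2.0 * progress, start_pg_lt_coef, 1.0)
    print(progress, pg_coef, st_coef, pg_lt_coef)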
Example #2
 def add_noise(self, mean=0.0, stddev=0.01):
     sess = tf_util.get_session()
     params = find_trainable_variables(self.model_name)
     for param in params:
         variables_shape = tf.shape(param)
         noise = tf.random_normal(
             variables_shape,
             mean=mean,
             stddev=stddev,
             dtype=tf.float32,
         )
         sess.run(tf.assign_add(param, noise))
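
Because the method above calls tf.assign_add inside the loop, every invocation adds new ops to the graph. A hedged sketch (the same tf_util.get_session and find_trainable_variables helpers are assumed to be in scope) of a variant that builds the noise ops once and reuses them:

def build_add_noise_op(model_name, mean=0.0, stddev=0.01):
    # Build one grouped op that perturbs every trainable variable of the model.
    params = find_trainable_variables(model_name)
    updates = []
    for param in params:
        noise = tf.random_normal(tf.shape(param), mean=mean,
                                 stddev=stddev, dtype=tf.float32)
        updates.append(tf.assign_add(param, noise))
    return tf.group(*updates)

# Hypothetical usage:
# add_noise_op = build_add_noise_op("a2c_model")
# tf_util.get_session().run(add_noise_op)  # run as often as needed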
Example #3
        def get_train_op(loss_op):
            params = find_trainable_variables("model")

            # Switch from GATE_NONE to GATE_GRAPH to enhance reproducibility.
            #grads = tf.gradients(loss, params)
            grads_and_params = trainer.compute_gradients(
                loss=loss_op,
                var_list=params,
                gate_gradients=tf.train.RMSPropOptimizer.GATE_GRAPH)
            grads = [x[0] for x in grads_and_params]
            params = [x[1] for x in grads_and_params]

            if max_grad_norm is not None:
                grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
            grads = list(zip(grads, params))

            return trainer.apply_gradients(grads)
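
For context, gate_gradients=GATE_GRAPH makes the optimizer compute all gradients before any of them is applied, trading some parallelism for more deterministic behavior. A self-contained toy sketch (TF1-style; the variables and loss below are made up for illustration, not taken from the snippet above):

import tensorflow as tf

x = tf.Variable(1.0)
y = tf.Variable(2.0)
toy_loss = tf.square(x * y - 3.0)

opt = tf.train.GradientDescentOptimizer(learning_rate=0.1)
grads_and_vars = opt.compute_gradients(
    toy_loss,
    var_list=[x, y],
    gate_gradients=tf.train.Optimizer.GATE_GRAPH)
train_op = opt.apply_gradients(grads_and_vars)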
Example #4
    def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack,
                 num_procs, ent_coef, q_coef, gamma, max_grad_norm, lr,
                 rprop_alpha, rprop_epsilon, total_timesteps, lrschedule, c,
                 trust_region, alpha, delta):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=num_procs,
                                inter_op_parallelism_threads=num_procs)
        sess = tf.Session(config=config)
        nact = ac_space.n
        nbatch = nenvs * nsteps

        A = tf.placeholder(tf.int32, [nbatch])  # actions
        D = tf.placeholder(tf.float32, [nbatch])  # dones
        R = tf.placeholder(tf.float32, [nbatch])  # rewards, not returns
        MU = tf.placeholder(tf.float32, [nbatch, nact])  # mu's
        LR = tf.placeholder(tf.float32, [])
        eps = 1e-6

        step_model = policy(sess,
                            ob_space,
                            ac_space,
                            nenvs,
                            1,
                            nstack,
                            reuse=False)
        train_model = policy(sess,
                             ob_space,
                             ac_space,
                             nenvs,
                             nsteps + 1,
                             nstack,
                             reuse=True)

        params = find_trainable_variables("model")
        print("Params {}".format(len(params)))
        for var in params:
            print(var)

        # create polyak averaged model
        ema = tf.train.ExponentialMovingAverage(alpha)
        ema_apply_op = ema.apply(params)

        def custom_getter(getter, *args, **kwargs):
            v = ema.average(getter(*args, **kwargs))
            print(v.name)
            return v

        with tf.variable_scope("", custom_getter=custom_getter, reuse=True):
            polyak_model = policy(sess,
                                  ob_space,
                                  ac_space,
                                  nenvs,
                                  nsteps + 1,
                                  nstack,
                                  reuse=True)

        # Notation: (var) = batch variable, (var)s = sequence variable, (var)_i = variable indexed by action at step i
        v = tf.reduce_sum(train_model.pi * train_model.q,
                          axis=-1)  # shape is [nenvs * (nsteps + 1)]

        # strip off last step
        f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps),
                          [train_model.pi, polyak_model.pi, train_model.q])
        # Get pi and q values for actions taken
        f_i = get_by_index(f, A)
        q_i = get_by_index(q, A)

        # Compute ratios for importance truncation
        rho = f / (MU + eps)
        rho_i = get_by_index(rho, A)

        # Calculate Q_retrace targets
        qret = q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma)

        # Calculate losses
        # Entropy
        entropy = tf.reduce_mean(cat_entropy_softmax(f))

        # Policy Gradient loss, with truncated importance sampling & bias correction
        v = strip(v, nenvs, nsteps, True)
        check_shape([qret, v, rho_i, f_i], [[nenvs * nsteps]] * 4)
        check_shape([rho, f, q], [[nenvs * nsteps, nact]] * 3)

        # Truncated importance sampling
        adv = qret - v
        logf = tf.log(f_i + eps)
        gain_f = logf * tf.stop_gradient(
            adv * tf.minimum(c, rho_i))  # [nenvs * nsteps]
        loss_f = -tf.reduce_mean(gain_f)

        # Bias correction for the truncation
        adv_bc = (q - tf.reshape(v, [nenvs * nsteps, 1])
                  )  # [nenvs * nsteps, nact]
        logf_bc = tf.log(f + eps)  # / (f_old + eps)
        check_shape([adv_bc, logf_bc], [[nenvs * nsteps, nact]] * 2)
        gain_bc = tf.reduce_sum(
            logf_bc *
            tf.stop_gradient(adv_bc * tf.nn.relu(1.0 - (c / (rho + eps))) * f),
            axis=1)  #IMP: This is sum, as expectation wrt f
        loss_bc = -tf.reduce_mean(gain_bc)

        loss_policy = loss_f + loss_bc

        # Value/Q function loss, and explained variance
        check_shape([qret, q_i], [[nenvs * nsteps]] * 2)
        ev = q_explained_variance(tf.reshape(q_i, [nenvs, nsteps]),
                                  tf.reshape(qret, [nenvs, nsteps]))
        loss_q = tf.reduce_mean(tf.square(tf.stop_gradient(qret) - q_i) * 0.5)

        # Net loss
        check_shape([loss_policy, loss_q, entropy], [[]] * 3)
        loss = loss_policy + q_coef * loss_q - ent_coef * entropy

        if trust_region:
            g = tf.gradients(-(loss_policy - ent_coef * entropy) * nsteps *
                             nenvs, f)  #[nenvs * nsteps, nact]
            # k = tf.gradients(KL(f_pol || f), f)
            k = -f_pol / (
                f + eps
            )  #[nenvs * nsteps, nact] # Directly computed gradient of KL divergence wrt f
            k_dot_g = tf.reduce_sum(k * g, axis=-1)
            adj = tf.maximum(0.0, (tf.reduce_sum(k * g, axis=-1) - delta) /
                             (tf.reduce_sum(tf.square(k), axis=-1) +
                              eps))  #[nenvs * nsteps]

            # Calculate stats (before doing adjustment) for logging.
            avg_norm_k = avg_norm(k)
            avg_norm_g = avg_norm(g)
            avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g))
            avg_norm_adj = tf.reduce_mean(tf.abs(adj))

            g = g - tf.reshape(adj, [nenvs * nsteps, 1]) * k
            grads_f = -g / (
                nenvs * nsteps
            )  # These are trust region adjusted gradients wrt f, i.e. statistics of policy pi
            grads_policy = tf.gradients(f, params, grads_f)
            grads_q = tf.gradients(loss_q * q_coef, params)
            grads = [
                gradient_add(g1, g2, param)
                for (g1, g2, param) in zip(grads_policy, grads_q, params)
            ]

            avg_norm_grads_f = avg_norm(grads_f) * (nsteps * nenvs)
            norm_grads_q = tf.global_norm(grads_q)
            norm_grads_policy = tf.global_norm(grads_policy)
        else:
            grads = tf.gradients(loss, params)

        if max_grad_norm is not None:
            grads, norm_grads = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=rprop_alpha,
                                            epsilon=rprop_epsilon)
        _opt_op = trainer.apply_gradients(grads)

        # so when you call _train, you first do the gradient step, then you apply ema
        with tf.control_dependencies([_opt_op]):
            _train = tf.group(ema_apply_op)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        # Ops/Summaries to run, and their names for logging
        run_ops = [
            _train, loss, loss_q, entropy, loss_policy, loss_f, loss_bc, ev,
            norm_grads
        ]
        names_ops = [
            'loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc',
            'explained_variance', 'norm_grads'
        ]
        if trust_region:
            run_ops = run_ops + [
                norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k,
                avg_norm_g, avg_norm_k_dot_g, avg_norm_adj
            ]
            names_ops = names_ops + [
                'norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f',
                'avg_norm_k', 'avg_norm_g', 'avg_norm_k_dot_g', 'avg_norm_adj'
            ]

        def train(obs, actions, rewards, dones, mus, states, masks, steps):
            cur_lr = lr.value_steps(steps)
            td_map = {
                train_model.X: obs,
                polyak_model.X: obs,
                A: actions,
                R: rewards,
                D: dones,
                MU: mus,
                LR: cur_lr
            }
            if states != []:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
                td_map[polyak_model.S] = states
                td_map[polyak_model.M] = masks
            return names_ops, sess.run(run_ops, td_map)[1:]  # strip off _train

        def save(save_path):
            ps = sess.run(params)
            make_path(osp.dirname(save_path))
            joblib.dump(ps, save_path)

        self.train = train
        self.save = save
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.initial_state = step_model.initial_state
        tf.global_variables_initializer().run(session=sess)
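
The Q_retrace targets above come from the q_retrace helper, which is not shown. A minimal NumPy sketch of the standard Retrace(lambda=1) recursion that a helper of this shape typically implements, with truncated ratios rho_bar = min(1, rho); the exact signature and batching of the real helper are assumptions:

import numpy as np

def retrace_targets(rewards, dones, q_i, v, rho_i, gamma):
    # rewards, dones, q_i, rho_i: arrays of length T; v: length T + 1 (bootstrap value last).
    T = len(rewards)
    rho_bar = np.minimum(1.0, rho_i)
    targets = np.zeros(T)
    qret = v[-1]
    for t in reversed(range(T)):
        qret = rewards[t] + gamma * (1.0 - dones[t]) * qret
        targets[t] = qret
        qret = rho_bar[t] * (qret - q_i[t]) + v[t]
    return targets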
Example #5
  def __init__(self,
               policy,
               ob_space,
               ac_space,
               nenvs,
               total_timesteps,
               nprocs=32,
               nscripts=16,
               nsteps=20,
               nstack=4,
               ent_coef=0.1,
               vf_coef=0.5,
               vf_fisher_coef=1.0,
               lr=0.25,
               max_grad_norm=0.001,
               kfac_clip=0.001,
               lrschedule='linear',
               alpha=0.99,
               epsilon=1e-5):
    config = tf.ConfigProto(
        allow_soft_placement=True,
        intra_op_parallelism_threads=nprocs,
        inter_op_parallelism_threads=nprocs)
    config.gpu_options.allow_growth = True
    self.sess = sess = tf.Session(config=config)
    nsml.bind(sess=sess)
    #nact = ac_space.n
    nbatch = nenvs * nsteps
    A = tf.placeholder(tf.int32, [nbatch])

    XY0 = tf.placeholder(tf.int32, [nbatch])
    XY1 = tf.placeholder(tf.int32, [nbatch])

    # ADV == TD_TARGET - values
    ADV = tf.placeholder(tf.float32, [nbatch])
    TD_TARGET = tf.placeholder(tf.float32, [nbatch])
    PG_LR = tf.placeholder(tf.float32, [])
    VF_LR = tf.placeholder(tf.float32, [])

    self.model = step_model = policy(
        sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
    self.model2 = train_model = policy(
        sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True)

    # Policy 1 : Base Action : train_model.pi label = A

    script_mask = tf.concat(
        [
            tf.zeros([nscripts * nsteps, 1]),
            tf.ones([(nprocs - nscripts) * nsteps, 1])
        ],
        axis=0)

    pi = train_model.pi
    pac_weight = script_mask * (tf.nn.softmax(pi) - 1.0) + 1.0
    pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(A, depth=3), axis=1)
    neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=pi, labels=A)
    neglogpac *= tf.stop_gradient(pac_weight)

    inv_A = 1.0 - tf.cast(A, tf.float32)

    xy0_mask = tf.cast(A, tf.float32)
    xy1_mask = tf.cast(A, tf.float32)

    condition0 = tf.equal(xy0_mask, 2)
    xy0_mask = tf.where(condition0, tf.ones(tf.shape(xy0_mask)), xy0_mask)
    xy0_mask = 1.0 - xy0_mask

    condition1 = tf.equal(xy1_mask, 2)
    xy1_mask = tf.where(condition1, tf.zeros(tf.shape(xy1_mask)), xy1_mask)

    # One hot representation of chosen marine.
    # [batch_size, 2]
    pi_xy0 = train_model.pi_xy0
    pac_weight = script_mask * (tf.nn.softmax(pi_xy0) - 1.0) + 1.0
    pac_weight = tf.reduce_sum(
        pac_weight * tf.one_hot(XY0, depth=1024), axis=1)

    logpac_xy0 = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=pi_xy0, labels=XY0)
    logpac_xy0 *= tf.stop_gradient(pac_weight)
    logpac_xy0 *= tf.cast(xy0_mask, tf.float32)

    pi_xy1 = train_model.pi_xy1
    pac_weight = script_mask * (tf.nn.softmax(pi_xy1) - 1.0) + 1.0
    pac_weight = tf.reduce_sum(
        pac_weight * tf.one_hot(XY1, depth=1024), axis=1)

    # 1D? 2D?
    logpac_xy1 = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=pi_xy1, labels=XY1)
    logpac_xy1 *= tf.stop_gradient(pac_weight)
    logpac_xy1 *= tf.cast(xy1_mask, tf.float32)

    pg_loss = tf.reduce_mean(ADV * neglogpac)
    pg_loss_xy0 = tf.reduce_mean(ADV * logpac_xy0)
    pg_loss_xy1 = tf.reduce_mean(ADV * logpac_xy1)

    vf_ = tf.squeeze(train_model.vf)

    vf_r = tf.concat(
        [
            tf.ones([nscripts * nsteps, 1]),
            tf.zeros([(nprocs - nscripts) * nsteps, 1])
        ],
        axis=0) * TD_TARGET
    vf_masked = vf_ * script_mask + vf_r

    #vf_mask[0:nscripts * nsteps] = R[0:nscripts * nsteps]

    vf_loss = tf.reduce_mean(mse(vf_masked, TD_TARGET))
    entropy_a = tf.reduce_mean(cat_entropy(train_model.pi))
    entropy_xy0 = tf.reduce_mean(cat_entropy(train_model.pi_xy0))
    entropy_xy1 = tf.reduce_mean(cat_entropy(train_model.pi_xy1))
    entropy = entropy_a + entropy_xy0 + entropy_xy1

    loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

    params = find_trainable_variables("model")
    grads = tf.gradients(loss, params)
    if max_grad_norm is not None:
      grads, _ = tf.clip_by_global_norm(grads, max_grad_norm)
    grads = list(zip(grads, params))
    trainer = tf.train.RMSPropOptimizer(
        learning_rate=lr, decay=alpha, epsilon=epsilon)
    _train = trainer.apply_gradients(grads)

    self.logits = logits = train_model.pi

    # xy0

    self.params_common = params_common = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/common')
    self.params_xy0 = params_xy0 = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES,
        scope='model/xy0') + params_common

    train_loss_xy0 = pg_loss_xy0 - entropy * ent_coef + vf_coef * vf_loss

    self.grads_check_xy0 = grads_xy0 = tf.gradients(
        train_loss_xy0, params_xy0)
    if max_grad_norm is not None:
      grads_xy0, _ = tf.clip_by_global_norm(grads_xy0, max_grad_norm)

    grads_xy0 = list(zip(grads_xy0, params_xy0))
    trainer_xy0 = tf.train.RMSPropOptimizer(
        learning_rate=lr, decay=alpha, epsilon=epsilon)
    _train_xy0 = trainer_xy0.apply_gradients(grads_xy0)

    # xy1

    self.params_xy1 = params_xy1 = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES,
        scope='model/xy1') + params_common

    train_loss_xy1 = pg_loss_xy1 - entropy * ent_coef + vf_coef * vf_loss

    self.grads_check_xy1 = grads_xy1 = tf.gradients(
        train_loss_xy1, params_xy1)
    if max_grad_norm is not None:
      grads_xy1, _ = tf.clip_by_global_norm(grads_xy1, max_grad_norm)

    grads_xy1 = list(zip(grads_xy1, params_xy1))
    trainer_xy1 = tf.train.RMSPropOptimizer(
        learning_rate=lr, decay=alpha, epsilon=epsilon)
    _train_xy1 = trainer_xy1.apply_gradients(grads_xy1)

    self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

    def train(obs, states, td_targets, masks, actions, xy0, xy1, values):
      advs = td_targets - values
      for step in range(len(obs)):
        cur_lr = self.lr.value()

      td_map = {
          train_model.X: obs,
          A: actions,
          XY0: xy0,
          XY1: xy1,
          ADV: advs,
          TD_TARGET: td_targets,
          PG_LR: cur_lr
      }
      if states != []:
        td_map[train_model.S] = states
        td_map[train_model.M] = masks

      policy_loss, value_loss, policy_entropy, _, \
      policy_loss_xy0, policy_entropy_xy0, _, \
      policy_loss_xy1, policy_entropy_xy1, _ = sess.run(
          [pg_loss, vf_loss, entropy, _train,
           pg_loss_xy0, entropy_xy0, _train_xy0,
           pg_loss_xy1, entropy_xy1, _train_xy1],
          td_map)
      return policy_loss, value_loss, policy_entropy, \
             policy_loss_xy0, policy_entropy_xy0, \
             policy_loss_xy1, policy_entropy_xy1

    def save(save_path):
      ps = sess.run(params)
      joblib.dump(ps, save_path)

    def load(load_path):
      loaded_params = joblib.load(load_path)
      restores = []
      for p, loaded_p in zip(params, loaded_params):
        restores.append(p.assign(loaded_p))
      sess.run(restores)

    self.train = train
    self.save = save
    self.load = load
    self.train_model = train_model
    self.step_model = step_model
    self.step = step_model.step
    self.value = step_model.value
    self.initial_state = step_model.initial_state
    print("global_variables_initializer start")
    tf.global_variables_initializer().run(session=sess)
    print("global_variables_initializer complete")
Example #6
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 nsteps,
                 nstack,
                 num_procs,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 lr=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 total_timesteps=int(80e6),
                 lrschedule='linear'):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=num_procs,
                                inter_op_parallelism_threads=num_procs)
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        nact = ac_space.n
        nbatch = nenvs * nsteps

        writer = tf.summary.FileWriter(
            "/tmp/a2c_demo/1")  # Change for SAT: this is to use TensorBoard

        A = tf.placeholder(
            tf.int32, [nbatch])  # Comments by Fei: this must be the action
        ADV = tf.placeholder(
            tf.float32,
            [nbatch])  # Comments by Fei: this must be the advantage
        R = tf.placeholder(
            tf.float32, [nbatch])  # Comments by Fei: this must be the reward
        LR = tf.placeholder(
            tf.float32, [])  # Comments by Fei: this must be the learning rate

        step_model = policy(sess,
                            ob_space,
                            ac_space,
                            nenvs,
                            1,
                            nstack,
                            reuse=False)
        train_model = policy(sess,
                             ob_space,
                             ac_space,
                             nenvs,
                             nsteps,
                             nstack,
                             reuse=True)

        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.pi,
            labels=A)  # Comments by Fei: pi is nbatch * nact
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=alpha,
                                            epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()
            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                LR: cur_lr
            }
            if states != []:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train], td_map)
            # writer.add_graph(sess.graph)
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            make_path(save_path)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            ps = sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
Example #7
    def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack, num_procs,
            ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4,
            alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear', logdir=None):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=num_procs,
                                inter_op_parallelism_threads=num_procs)
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        nact = ac_space.n
        nbatch = nenvs*nsteps

        ADV = tf.placeholder(tf.float32, [None])
        R = tf.placeholder(tf.float32, [None])
        LR = tf.placeholder(tf.float32, [])

        step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
        train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True)

        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=train_model.a0)
        entropy = tf.reduce_sum(cat_entropy(train_model.pi))
        params = find_trainable_variables("model")
        tf.summary.histogram("vf", train_model.vf)
        tf.summary.histogram("R", R)

        if train_model.relaxed:
            pg_loss = tf.constant(0.0)
            oh_A = tf.one_hot(train_model.a0, ac_space.n)

            params = find_trainable_variables("model")
            policy_params = [v for v in params if "pi" in v.name]
            vf_params = [v for v in params if "vf" in v.name]
            entropy_grads = tf.gradients(entropy, policy_params)

            ddiff_loss = tf.reduce_sum(train_model.vf - train_model.vf_t)
            ddiff_grads = tf.gradients(ddiff_loss, policy_params)

            sm = tf.nn.softmax(train_model.pi)
            dlogp_dpi = oh_A * (1. - sm) + (1. - oh_A) * (-sm)
            pi_grads = -((tf.expand_dims(R, 1) - train_model.vf_t) * dlogp_dpi)
            pg_grads = tf.gradients(train_model.pi, policy_params, grad_ys=pi_grads)
            pg_grads = [pg - dg for pg, dg in zip(pg_grads, ddiff_grads)]

            pi_param_grads = tf.gradients(train_model.pi, policy_params, grad_ys=pi_grads)

            cv_grads = tf.concat([tf.reshape(p, [-1]) for p in pg_grads], 0)
            cv_grad_splits = tf.reduce_sum(tf.square(cv_grads))
            vf_loss = cv_grad_splits * vf_coef

            cv_grads = tf.gradients(vf_loss, vf_params)

            policy_grads = []
            for e_grad, p_grad, param in zip(entropy_grads, pg_grads, policy_params):
                grad = -e_grad * ent_coef + p_grad
                policy_grads.append(grad)
            grad_dict = {}

            for g, v in list(zip(policy_grads, policy_params))+list(zip(cv_grads, vf_params)):
                grad_dict[v] = g

            grads = [grad_dict[v] for v in params]
            print(grads)


        else:
            pg_loss = tf.reduce_sum((tf.stop_gradient(R) - tf.stop_gradient(train_model.vf)) * neglogpac)
            policy_params = [v for v in params if "pi" in v.name]
            pg_grads = tf.gradients(pg_loss, policy_params)

            vf_loss = tf.reduce_sum(mse(tf.squeeze(train_model.vf), R))
            loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
            grads = tf.gradients(loss, params)

        grads = list(zip(grads, params))

        ema = tf.train.ExponentialMovingAverage(.99)
        all_policy_grads = tf.concat([tf.reshape(g, [-1]) for g in pg_grads], 0)
        all_policy_grads_sq = tf.square(all_policy_grads)
        apply_mean_op = ema.apply([all_policy_grads, all_policy_grads_sq])
        em_mean = ema.average(all_policy_grads)
        em_mean_sq = ema.average(all_policy_grads_sq)
        em_var = em_mean_sq - tf.square(em_mean)
        em_log_var = tf.log(em_var + 1e-20)
        mlgv = tf.reduce_mean(em_log_var)

        for g, v in grads:
            print(v.name, g)
            tf.summary.histogram(v.name, v)
            tf.summary.histogram(v.name+"_grad", g)

        self.sum_op = tf.summary.merge_all()
        self.writer = tf.summary.FileWriter(logdir)

        trainer = tf.train.AdamOptimizer(learning_rate=LR, beta2=.99999)
        with tf.control_dependencies([apply_mean_op]):
            _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)
        self._step = 0
        def train(obs, states, rewards, masks, u1, u2, values, summary=False):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()
            td_map = {
                train_model.X:obs, train_model.U1:u1, train_model.U2:u2,
                ADV:advs, R:rewards, LR:cur_lr
            }
            if states != []:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            if summary:
                sum_str, policy_loss, value_loss, policy_entropy, lv, _ = sess.run(
                    [self.sum_op, pg_loss, vf_loss, entropy, mlgv, _train],
                    td_map
                )
                self.writer.add_summary(sum_str, self._step)
            else:
                policy_loss, value_loss, policy_entropy, lv, _ = sess.run(
                    [pg_loss, vf_loss, entropy, mlgv, _train],
                    td_map
                )
            self._step += 1
            return policy_loss, value_loss, policy_entropy, lv

        def save(save_path):
            ps = sess.run(params)
            make_path(save_path)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            ps = sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
Example #8
 def load_variables_from_another_model(self, another_model_name):
     sess = tf_util.get_session()
     params = find_trainable_variables(self.model_name)
     another_params = find_trainable_variables(another_model_name)
     for pair in zip(params, another_params):
         sess.run(tf.assign(pair[0], pair[1]))
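
As with add_noise above, running tf.assign inside the loop creates new ops on every call. A hedged sketch (same helpers assumed to be in scope) that builds the copy as a single grouped op once and reuses it:

def build_copy_op(model_name, another_model_name):
    params = find_trainable_variables(model_name)
    another_params = find_trainable_variables(another_model_name)
    assigns = [tf.assign(dst, src) for dst, src in zip(params, another_params)]
    return tf.group(*assigns)

# Hypothetical usage (scope names are examples only):
# copy_op = build_copy_op("model", "target_model")
# tf_util.get_session().run(copy_op)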
Example #9
    def __init__(self, policy, ob_space, ac_space, nenvs,total_timesteps, nprocs=32, nsteps=20,
                 ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
                 kfac_clip=0.001, lrschedule='linear'):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=nprocs,
                                inter_op_parallelism_threads=nprocs)
        config.gpu_options.allow_growth = True
        self.sess = sess = tf.Session(config=config)
        nact = ac_space.n
        nbatch = nenvs * nsteps
        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        PG_LR = tf.placeholder(tf.float32, [])
        VF_LR = tf.placeholder(tf.float32, [])

        self.model = step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
        self.model2 = train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True)

        logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)
        self.logits = logits = train_model.pi

        ##training loss
        pg_loss = tf.reduce_mean(ADV*logpac)
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        pg_loss = pg_loss - ent_coef * entropy
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        train_loss = pg_loss + vf_coef * vf_loss


        ##Fisher loss construction
        self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac)
        sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf))
        self.vf_fisher = vf_fisher_loss = - vf_fisher_coef*tf.reduce_mean(tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
        self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss

        self.params=params = find_trainable_variables("model")

        self.grads_check = grads = tf.gradients(train_loss,params)

        with tf.device('/gpu:0'):
            self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,\
                momentum=0.9, kfac_update=1, epsilon=0.01,\
                stats_decay=0.99, async=1, cold_iter=10, max_grad_norm=max_grad_norm)

            update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
            train_op, q_runner = optim.apply_gradients(list(zip(grads,params)))
        self.q_runner = q_runner
        self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = self.lr.value()

            td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, PG_LR:cur_lr}
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, train_op],
                td_map
            )
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)



        self.train = train
        self.save = save
        self.load = load
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        tf.global_variables_initializer().run(session=sess)
Example #10
    def __init__(self, policy, ob_space, ac_space, nenvs, master_ts = 1, worker_ts = 30,
            ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4, cell = 256,
            alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear',
            algo='regular', beta=1e-3):

        print('Create Session')
        gpu_options = tf.GPUOptions(allow_growth=True)
        sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
        nact = ac_space.n
        nbatch = nenvs*master_ts*worker_ts

        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        step_model = policy(sess, ob_space, ac_space, nenvs, 1, 1, cell = cell, model='step_model', algo=algo)
        train_model = policy(sess, ob_space, ac_space, nbatch, master_ts, worker_ts, model='train_model', algo=algo)
        print('model_setting_done')

        #loss construction
        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.wpi, labels=A)
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.wvf), R))
        entropy = tf.reduce_mean(cat_entropy(train_model.wpi))
        pg_loss = pg_loss - entropy * ent_coef
        print('algo: ', algo, 'max_grad_norm: ', str(max_grad_norm))
        try:
            if algo == 'regular':
                loss = pg_loss + vf_coef * vf_loss
            elif algo == 'VIB':
                '''
                implement VIB here, apart from the vf_loss and pg_loss, there should be a third loss,
                the kl_loss = ds.kl_divergence(model.encoding, prior), where prior is a Gaussian distribution with mu=0, std=1
                the final loss should be pg_loss + vf_coef * vf_loss + beta*kl_loss
                '''
                prior = ds.Normal(0.0, 1.0)
                kl_loss = tf.reduce_mean(ds.kl_divergence(train_model.encoding, prior))
                loss = pg_loss + vf_coef * vf_loss + beta*kl_loss
                # pass
            else:
                raise Exception('Algorithm does not exist')
        except Exception as e:
            print(e)

        grads, global_norm = grad_clip(loss, max_grad_norm, ['model'])
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(wobs, whs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(whs)):
                cur_lr = lr.value()

            td_map = {train_model.wX:wobs, A:actions, ADV:advs, R:rewards, LR:cur_lr}
            if states is not None:
                td_map[train_model.wS] = states
                td_map[train_model.wM] = masks

            '''
            you can add and run additional loss for VIB here for debugging, such as kl_loss
            '''
            tloss, value_loss, policy_loss, policy_entropy, _ = sess.run(
                [loss, vf_loss, pg_loss, entropy, _train],
                feed_dict=td_map
            )
            return tloss, value_loss, policy_loss, policy_entropy

        params = find_trainable_variables("model")
        def save(save_path):
            ps = sess.run(params)
            make_path(osp.dirname(save_path))
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            ps = sess.run(restores)

        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.wvalue
        self.get_wh = step_model.get_wh
        self.initial_state = step_model.w_initial_state
        self.train = train
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
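
For reference, when train_model.encoding is a diagonal Gaussian N(mu, sigma) and the prior is N(0, 1), the VIB KL term used above has a closed form, KL = -log(sigma) + (sigma^2 + mu^2 - 1) / 2 per dimension. A small NumPy sketch of that formula (shapes assumed):

import numpy as np

def kl_diag_gaussian_vs_std_normal(mu, sigma):
    # KL( N(mu, sigma^2) || N(0, 1) ), elementwise over dimensions.
    return -np.log(sigma) + 0.5 * (sigma ** 2 + mu ** 2 - 1.0)

# print(kl_diag_gaussian_vs_std_normal(np.array([0.0, 1.0]), np.array([1.0, 0.5])))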
Example #11
    def __init__(self,
                 policy,
                 env,
                 nsteps,
                 ent_coef=1,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 fisher_matrix=None,
                 star_param=None,
                 lam=None,
                 batch_size=None,
                 fisher_matrix2=None,
                 star_param2=None):

        sess = tf_util.get_session()
        nbatch = batch_size

        with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE):

            # train_model is used to train our network
            train_model = policy(nbatch, 1, sess)
            eval_model = policy(1, 1, sess)

        # Update parameters using loss.
        # 1. Get the model parameters
        params = find_trainable_variables("a2c_model")
        print(len(params))

        ewc_loss = 0
        ewc_per_layer = []
        j = 0
        # star_param is the loaded array; params are the model's parameters (tensors)
        for v in range(len(params) - 4):
            if v % 2 == 0:
                C = star_param[v].shape[-1]
                W = tf.transpose(tf.reshape(params[v] - star_param[v],
                                            [-1, C]))
                right = tf.reshape(
                    tf.transpose(
                        tf.matmul(tf.matmul(fisher_matrix[j + 1], W),
                                  fisher_matrix[j])), [-1, 1])
                W_ = tf.reshape(params[v] - star_param[v], [1, -1])

                ewc_loss += tf.matmul(W_, right)
                ewc_per_layer.append(tf.matmul(W_, right))
                j = j + 2
            else:
                B = tf.reshape(params[v] - star_param[v], [-1, 1])
                right_B = tf.matmul(fisher_matrix[j], B)
                B_ = tf.reshape(params[v] - star_param[v], [1, -1])

                ewc_loss += tf.matmul(B_, right_B)
                ewc_per_layer.append(tf.matmul(B_, right_B))

                j += 1

        ewc_loss2 = 0
        j = 0
        for v in range(len(params) - 4):
            if v % 2 == 0:
                C2 = star_param2[v].shape[-1]
                W2 = tf.transpose(
                    tf.reshape(params[v] - star_param2[v], [-1, C2]))
                right2 = tf.reshape(
                    tf.transpose(
                        tf.matmul(tf.matmul(fisher_matrix2[j + 1], W2),
                                  fisher_matrix2[j])), [-1, 1])
                W_2 = tf.reshape(params[v] - star_param2[v], [1, -1])

                ewc_loss2 += tf.matmul(W_2, right2)
                j = j + 2
            else:
                B2 = tf.reshape(params[v] - star_param2[v], [-1, 1])
                right_B2 = tf.matmul(fisher_matrix2[j], B2)
                B_2 = tf.reshape(params[v] - star_param2[v], [1, -1])

                ewc_loss2 += tf.matmul(B_2, right_B2)
                j += 1

        loss1 = ewc_loss * (lam / 2)
        loss2 = ewc_loss2 * (lam / 2)
        loss = loss1 + loss2

        # 2. Calculate the gradients
        trainer = tf.train.AdamOptimizer(learning_rate=1e-3)
        # grads_and_var = trainer.compute_gradients(loss, params)
        # grads, var = zip(*grads_and_var)
        # grads, _grad_norm = tf.clip_by_global_norm(grads, 0.5)
        #
        # grads_and_var = list(zip(grads, var))

        grads = tf.gradients(loss, params)

        grads_and_var = list(zip(grads, params))
        # zip aggregate each gradient with parameters associated
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        # 3. Make op for one policy and value update step of A2C
        # trainer = tf.train.RMSPropOptimizer(learning_rate=7e-4, decay=alpha, epsilon=epsilon)

        # trainer = tf.train.MomentumOptimizer(learning_rate=0.001,momentum=0.9,use_nesterov=True)
        _train = trainer.apply_gradients(grads_and_var)

        #
        #
        # grads2 = tf.gradients(loss2,params)
        # grads2 = list(zip(grads2, params))
        # trainer2 = tf.train.RMSPropOptimizer(learning_rate=7e-4, decay=alpha, epsilon=epsilon)
        # trainer2 = tf.train.AdamOptimizer(learning_rate=1e-4)
        # trainer2 = tf.train.MomentumOptimizer(learning_rate=0.001, momentum=0.9, use_nesterov=True)
        # _train2 = trainer2.apply_gradients(grads2)

        def train():

            td_map = {train_model.keep_prob: 1.0}

            ewc1, ewc2, l, _ = sess.run([loss1, loss2, loss, _train], td_map)

            return ewc1, ewc2, l

            # KL,ewc,_ = sess.run([KL_loss,ewc_loss,_train],td_map)

            # return KL, ewc

        def creat_star_param():
            star_list = []
            for p in params[:-2]:
                star_list.append(star_param[p.name])
            return star_list

        self.creat_star_param = creat_star_param
        self.train = train
        self.train_model = train_model
        self.act = eval_model.step
        self.act2 = eval_model.step2
        self.save = functools.partial(tf_util.save_variables, sess=sess)
        self.load = functools.partial(tf_util.load_variables, sess=sess)
        tf.global_variables_initializer().run(session=sess)
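
The penalty above appears to be a Kronecker-factored EWC term (each weight matrix is wrapped between two Fisher factors). For contrast, a minimal sketch of the more common diagonal-Fisher EWC penalty, (lam / 2) * sum_i F_i * (theta_i - theta_i*)^2, assuming star_param and fisher_diag are per-variable arrays matching params (these names are illustrative, not from the snippet):

import tensorflow as tf

def diagonal_ewc_loss(params, star_param, fisher_diag, lam):
    # params: list of tf.Variable; star_param / fisher_diag: matching NumPy arrays.
    penalty = 0.0
    for p, p_star, f in zip(params, star_param, fisher_diag):
        penalty += tf.reduce_sum(f * tf.square(p - p_star))
    return (lam / 2.0) * penalty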
Example #12
    def __init__(self,
                 policy,
                 nenvs,
                 total_timesteps,
                 nprocs=32,
                 nsteps=20,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 vf_fisher_coef=1.0,
                 lr=0.25,
                 max_grad_norm=0.5,
                 kfac_clip=0.001,
                 lrschedule='linear',
                 is_async=True):

        sess = get_session()
        nbatch = nenvs * nsteps

        with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE):
            # step_model is used for sampling
            step_model = policy(nenvs, 1, sess)

            # train_model is used to train our network
            train_model = policy(nbatch, nsteps, sess)
            eval_model = policy(1, 1, sess)

        # A = train_model.pdtype.sample_placeholder([None])
        # A = tf.placeholder(step_model.action.dtype, step_model.action.shape)
        probs = tf.nn.softmax(step_model.pi)
        class_ind = tf.to_int32(tf.multinomial(tf.log(probs), 1)[0][0])
        self.pg_fisher = pg_fisher_loss = tf.log(probs[0, class_ind])

        ##Fisher loss construction
        # self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(neglogpac)
        # sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf))
        # self.vf_fisher = vf_fisher_loss = - vf_fisher_coef*tf.reduce_mean(tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
        # self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss
        self.joint_fisher = joint_fisher_loss = pg_fisher_loss

        self.params = params = find_trainable_variables("a2c_model")

        with tf.device('/gpu:0'):
            self.optim = optim = kfac.KfacOptimizer()

            # update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
            stats = optim.compute_and_apply_stats(joint_fisher_loss,
                                                  var_list=params[:-4])

        def compute_fisher(obs):
            # action = action[:, np.newaxis]
            td_map = {step_model.X: obs, step_model.keep_prob: 1.0}

            fisher = sess.run(stats, td_map)
            return fisher

        self.compute_fisher = compute_fisher
        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        tf.global_variables_initializer().run(session=sess)
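
A side note on the sampling line above: tf.multinomial expects unnormalized log-probabilities, so sampling from tf.log(softmax(pi)) is equivalent, up to an additive constant that multinomial ignores, to sampling from the logits directly. A tiny self-contained illustration of that equivalence (toy logits, TF1-style):

import tensorflow as tf

logits = tf.constant([[1.0, 2.0, 0.5]])
probs = tf.nn.softmax(logits)
sample_from_log_probs = tf.multinomial(tf.log(probs), num_samples=1)
sample_from_logits = tf.multinomial(logits, num_samples=1)
# Both draws follow the same categorical distribution softmax(logits).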
Example #13
File: acktr.py Project: MrGoogol/baselines
    def __init__(self, policy, ob_space, ac_space, nenvs,total_timesteps, nprocs=32, nsteps=20,
                 ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
                 kfac_clip=0.001, lrschedule='linear', is_async=True):

        self.sess = sess = get_session()
        nbatch = nenvs * nsteps
        with tf.variable_scope('acktr_model', reuse=tf.AUTO_REUSE):
            self.model = step_model = policy(nenvs, 1, sess=sess)
            self.model2 = train_model = policy(nenvs*nsteps, nsteps, sess=sess)

        A = train_model.pdtype.sample_placeholder([None])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        PG_LR = tf.placeholder(tf.float32, [])
        VF_LR = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)
        self.logits = train_model.pi

        ##training loss
        pg_loss = tf.reduce_mean(ADV*neglogpac)
        entropy = tf.reduce_mean(train_model.pd.entropy())
        pg_loss = pg_loss - ent_coef * entropy
        vf_loss = tf.losses.mean_squared_error(tf.squeeze(train_model.vf), R)
        train_loss = pg_loss + vf_coef * vf_loss


        ##Fisher loss construction
        self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(neglogpac)
        sample_net = train_model.vf + tf.random_normal(tf.shape(train_model.vf))
        self.vf_fisher = vf_fisher_loss = - vf_fisher_coef*tf.reduce_mean(tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
        self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss

        self.params=params = find_trainable_variables("acktr_model")

        self.grads_check = grads = tf.gradients(train_loss,params)

        with tf.device('/gpu:0'):
            self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,\
                momentum=0.9, kfac_update=1, epsilon=0.01,\
                stats_decay=0.99, is_async=is_async, cold_iter=10, max_grad_norm=max_grad_norm)

            # update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
            optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
            train_op, q_runner = optim.apply_gradients(list(zip(grads,params)))
        self.q_runner = q_runner
        self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = self.lr.value()

            td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, PG_LR:cur_lr, VF_LR:cur_lr}
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, train_op],
                td_map
            )
            return policy_loss, value_loss, policy_entropy


        self.train = train
        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        tf.global_variables_initializer().run(session=sess)
Example #14
File: a2c.py Project: IcarusTan/baselines
    def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack, num_procs,
            ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4,
            alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=num_procs,
                                inter_op_parallelism_threads=num_procs)
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        nact = ac_space.n
        nbatch = nenvs*nsteps

        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        step_model = policy(sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
        train_model = policy(sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True)

        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        loss = pg_loss - entropy*ent_coef + vf_loss * vf_coef

        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()
            td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr}
            if states != []:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train],
                td_map
            )
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            make_path(save_path)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            ps = sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
Example #15
File: a2c.py Project: NeteaseFuxiRL/asf
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 nsteps,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 lr=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 total_timesteps=int(80e6),
                 lrschedule='linear',
                 param=None):

        sess = tf_util.make_session()
        nact = ac_space.n
        nbatch = nenvs * nsteps

        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        step_model = policy(sess,
                            ob_space,
                            ac_space,
                            nenvs,
                            1,
                            reuse=False,
                            param=param)
        train_model = policy(sess,
                             ob_space,
                             ac_space,
                             nenvs * nsteps,
                             nsteps,
                             reuse=True,
                             param=param)

        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.pi, labels=A)
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=alpha,
                                            epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()
            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                LR: cur_lr
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train], td_map)
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            make_path(save_path)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            ps = sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
Example #16
File: acktr.py Project: yoniosin/A2C
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 total_timesteps,
                 nprocs=32,
                 nsteps=20,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 vf_fisher_coef=1.0,
                 lr=0.25,
                 max_grad_norm=0.5,
                 kfac_clip=0.001,
                 lrschedule='linear',
                 is_async=True):

        self.sess = sess = get_session()
        nbatch = nenvs * nsteps
        with tf.variable_scope('acktr_model', reuse=tf.AUTO_REUSE):
            self.model = step_model = policy(nenvs, 1, sess=sess)
            self.model2 = train_model = policy(nenvs * nsteps,
                                               nsteps,
                                               sess=sess)

        A = train_model.pdtype.sample_placeholder([None])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        PG_LR = tf.placeholder(tf.float32, [])
        VF_LR = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)
        self.logits = train_model.pi

        ##training loss
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        entropy = tf.reduce_mean(train_model.pd.entropy())
        pg_loss = pg_loss - ent_coef * entropy
        vf_loss = tf.losses.mean_squared_error(tf.squeeze(train_model.vf), R)
        train_loss = pg_loss + vf_coef * vf_loss

        ##Fisher loss construction
        self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(neglogpac)
        sample_net = train_model.vf + tf.random_normal(tf.shape(
            train_model.vf))
        self.vf_fisher = vf_fisher_loss = -vf_fisher_coef * tf.reduce_mean(
            tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
        self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss

        self.params = params = find_trainable_variables("acktr_model")

        self.grads_check = grads = tf.gradients(train_loss, params)

        with tf.device('/gpu:0'):
            self.optim = optim = kfac.KfacOptimizer(learning_rate=PG_LR, clip_kl=kfac_clip,\
                momentum=0.9, kfac_update=1, epsilon=0.01,\
                stats_decay=0.99, is_async=is_async, cold_iter=10, max_grad_norm=max_grad_norm)

            # update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
            optim.compute_and_apply_stats(joint_fisher_loss, var_list=params)
            train_op, q_runner = optim.apply_gradients(list(zip(grads,
                                                                params)))
        self.q_runner = q_runner
        self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = self.lr.value()

            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                PG_LR: cur_lr,
                VF_LR: cur_lr
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, train_op], td_map)
            return policy_loss, value_loss, policy_entropy

        self.train = train
        self.save = functools.partial(save_variables, sess=sess)
        self.load = functools.partial(load_variables, sess=sess)
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        tf.global_variables_initializer().run(session=sess)
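A note on the Fisher-loss terms above (a sketch of the reasoning, not taken from this file): K-FAC needs log-likelihood objectives whose gradients define the curvature statistics, so the policy head contributes -mean(neglogpac), and the value head is treated as a unit-variance Gaussian. sample_net draws from that Gaussian, and the negated squared error to the (stop-gradient) sample is, up to constants, the Gaussian log-likelihood that compute_and_apply_stats differentiates. A tiny NumPy illustration of the value-head term, with made-up numbers:

import numpy as np

v = np.array([0.2, -0.1, 0.4])            # value-head outputs
sample = v + np.random.randn(*v.shape)    # a draw from N(v, 1), mirroring sample_net
# Up to an additive constant, log N(sample | v, 1) = -0.5 * (v - sample) ** 2,
# so the negated mean squared error below is a scaled log-likelihood.
vf_fisher = -np.mean((v - sample) ** 2)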
Example #17
0
    def __init__(self, policy, p, has_state):
        """
        policy : Internal Policy model such as  SnakeModel.CNNPolicy
        p : Hyperparameters required for training
        """
        sess = tf_util.make_session()
        # Tensorflow model initiallization
        step_model = policy(sess=sess,
                            p=p,
                            train_phase=False,
                            has_state=has_state)  # Deploy model settings
        train_model = policy(sess=sess,
                             p=p,
                             train_phase=True,
                             has_state=has_state)  # Training model settings
        saver = tf.train.Saver()

        #Step 2 : Initialize the training parameters
        A = tf.placeholder(tf.int32, [p.N_BATCH])
        ADV = tf.placeholder(tf.float32, [p.N_BATCH])
        R = tf.placeholder(tf.float32, [p.N_BATCH])
        LR = tf.placeholder(tf.float32, [])

        #Step 3 : Define the loss Function
        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.pi, labels=A)  #
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        loss = pg_loss - entropy * p.ENTROPY_COEFF + vf_loss * p.VALUE_FUNC_COEFF

        #Step 4 : Define the loss optimizer
        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if p.MAX_GRAD_NORM is not None:
            grads, grad_norm = tf.clip_by_global_norm(
                grads, p.MAX_GRAD_NORM
            )  # Clipping the gradients to protect learned weights
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=p.RMS_DECAY,
                                            epsilon=p.EPSILON)
        _train = trainer.apply_gradients(
            grads)  # op that applies one gradient update step
        lr = Scheduler(v=p.LEARNING_RATE,
                       nvalues=p.N_TIMESTEPS,
                       schedule=p.LEARNING_RATE_SCHEDULE
                       )  # Learning rate changes linearly or as per arguments

        # Step 5 : Write down the summary parameters to be used
        writer = tf.summary.FileWriter(p.LOG_PATH)  #summary writer

        def train(obs, rewards, masks, actions, values, states):
            """
            obs     : batch x n x m x 1 snake matrix
            rewards : batch x 1 rewards corresponding to the action taken
            actions : batch x 1 discrete action taken
            values  : batch x 1 output of the value function during training
            """
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()
            td_map = {
                train_model.X: obs,
                train_model.S: states,
                A: actions,
                ADV: advs,
                R: rewards,
                LR: cur_lr
            }
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train], td_map)
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            #ps = sess.run(params)
            #make_path(save_path)
            #joblib.dump(ps, save_path)
            saver.save(sess, save_path)

        def load(load_path):
            #loaded_params = joblib.load(load_path)
            #restores = []
            #for p, loaded_p in zip(params, loaded_params):
            #    restores.append(p.assign(loaded_p))
            #ps = sess.run(restores)
            saver.restore(sess, load_path)

        def add_scalar_summary(tag, value, step):
            summary = tf.Summary(
                value=[tf.Summary.Value(tag=tag, simple_value=value)])
            writer.add_summary(summary, step)

        # Expose the user to closure functions
        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.hidden_value = step_model.hidden_value
        self.initial_state = step_model.initial_state
        self.add_scalar_summary = add_scalar_summary
        self.save = save
        self.load = load
        # Initialize global variables and add tf graph
        tf.global_variables_initializer().run(session=sess)
        writer.add_graph(tf.get_default_graph())  #write graph
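The mse and cat_entropy helpers used in the loss above are not shown in this snippet; minimal sketches of what the standard baselines-style helpers compute (assumed implementations, for illustration only):

import tensorflow as tf

def cat_entropy_sketch(logits):
    # Entropy of the categorical distribution defined by `logits`,
    # computed stably by subtracting the per-row max before exponentiating.
    a0 = logits - tf.reduce_max(logits, axis=-1, keepdims=True)
    ea0 = tf.exp(a0)
    z0 = tf.reduce_sum(ea0, axis=-1, keepdims=True)
    p0 = ea0 / z0
    return tf.reduce_sum(p0 * (tf.log(z0) - a0), axis=-1)

def mse_sketch(pred, target):
    # Per-element squared error; the callers above wrap it in tf.reduce_mean.
    return tf.square(pred - target) / 2.0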
Example #18
0
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 nsteps,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 lr=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 lambda_dist=0.01,
                 total_timesteps=None,
                 lrschedule='linear'):

        sess = tf.get_default_session()
        nact = ac_space.n
        nbatch = nenvs * nsteps

        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        config = Config()

        act_model = policy(config)
        config.reuse = True
        train_model = policy(config)

        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.logits, labels=A)
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        entropy = tf.reduce_mean(cat_entropy(train_model.logits))

        aux_loss = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=train_model.rp_logits, labels=A))  # reduced to a scalar, like the other loss terms
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef + aux_loss * lambda_dist

        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=alpha,
                                            epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        saver = tf.train.Saver()

        def train(obs, rs, rr, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()
            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                LR: cur_lr,
                train_model.inputs_s: rs,
                train_model.inputs_r: rr
            }

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train], td_map)
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            saver.save(sess, save_path + 'model.ckpt')

        def load(load_path):
            saver.restore(sess, load_path + 'model.ckpt')

        self.train = train
        self.train_model = train_model
        self.act_model = act_model
        self.act = act_model.act
        self.value = act_model.value
        self.save = save
        self.load = load
Example #19
0
File: a2c.py Project: yoniosin/A2C
    def __init__(self,
                 policy,
                 env,
                 nsteps,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 lr=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 total_timesteps=int(80e6),
                 lrschedule='linear',
                 network='cnn',
                 prio_args=None):

        self.prio_args = prio_args
        sess = tf_util.get_session()
        nenvs = self.get_active_envs(env)

        nbatch = nenvs * nsteps

        with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE):
            # step_model is used for sampling
            step_model = policy(nenvs, 1, sess)

            # train_model is used to train our network
            train_model = policy(nbatch, nsteps, sess)
            # our TD evaluating network

        A = tf.placeholder(train_model.action.dtype, train_model.action.shape)
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        # Calculate the loss
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Policy loss
        neglogpac = train_model.pd.neglogp(A)
        # L = A(s,a) * -logpi(a|s)
        pg_loss = tf.reduce_mean(ADV * neglogpac)

        # Entropy is used to improve exploration by limiting the premature convergence to suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # Value loss
        vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R)

        # TD loss
        # td_loss = losses.mean_squared_error(tf.squeeze(train_model.dt), TD)

        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
        """prio model"""
        with tf.variable_scope('a2c_model_prio', reuse=tf.AUTO_REUSE):
            # prio_model = policy(nbatch, nsteps, sess)
            prio_model = MyNN(env, nbatch, network)

        P_R = tf.placeholder(tf.float32, [nbatch])
        PRIO = tf.placeholder(tf.float32, [nbatch])
        P_LR = tf.placeholder(tf.float32, [])

        # prio_model_loss = losses.mean_squared_error(tf.squeeze(prio_model.out), P_R) # Reward
        prio_model_loss = losses.mean_squared_error(tf.squeeze(prio_model.out),
                                                    PRIO)  # TD Error
        # Update parameters using loss
        # 1. Get the model parameters
        params = find_trainable_variables("a2c_model")
        params_prio = find_trainable_variables("a2c_model_prio")

        # 2. Calculate the gradients
        grads = tf.gradients(loss, params)
        prio_grads = tf.gradients(prio_model_loss, params_prio)
        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
            prio_grads, prio_grad_norm = tf.clip_by_global_norm(
                prio_grads, max_grad_norm)
        grads = list(zip(grads, params))
        prio_grads = list(zip(prio_grads, params_prio))
        # zip aggregate each gradient with parameters associated
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        # 3. Make op for one policy and value update step of A2C
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=alpha,
                                            epsilon=epsilon)
        prio_trainer = tf.train.RMSPropOptimizer(learning_rate=P_LR,
                                                 decay=alpha,
                                                 epsilon=epsilon)

        _train = trainer.apply_gradients(grads)
        _prio_train = prio_trainer.apply_gradients(prio_grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            # Here we calculate advantage A(s,a) = R + yV(s') - V(s)
            # rewards = R + yV(s')
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()

            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                LR: cur_lr
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train], td_map)

            prio_loss = 0
            if self.prio_args is not None:
                prio_values = GetValuesForPrio(self.prio_args['prio_type'],
                                               self.prio_args['prio_param'],
                                               advs, rewards)
                prio_td_map = {
                    prio_model.X: obs,
                    P_R: rewards,
                    P_LR: cur_lr,
                    PRIO: prio_values
                }

                prio_loss, _, p_td = sess.run(
                    [prio_model_loss, _prio_train, PRIO], prio_td_map)
                # mb arranged as a 1D vector = [[env_1: n1, ..., n_nstep], ..., [env_n_active]]
                # need to take the last value of each env's buffer
                self.prio_score = prio_values[list(
                    filter(lambda x: x % nsteps == (nsteps - 1),
                           range(len(prio_values))))]
            return policy_loss, value_loss, policy_entropy, prio_loss

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.prio_model = prio_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = functools.partial(tf_util.save_variables, sess=sess)
        self.load = functools.partial(tf_util.load_variables, sess=sess)
        tf.global_variables_initializer().run(session=sess)
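The filter(lambda ...) expression at the end of train() above keeps the last step of each environment's nsteps-long slice of the flattened minibatch; a small self-contained sketch of the equivalent slicing, with illustrative numbers:

import numpy as np

nsteps = 5
prio_values = np.arange(20)                          # e.g. 4 envs * 5 steps, flattened env-major
last_of_each_env = prio_values[nsteps - 1::nsteps]   # same indices as i % nsteps == nsteps - 1
print(last_of_each_env)                              # [ 4  9 14 19]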
Example #20
0
    def __init__(self,
                 policy,
                 env,
                 nsteps,
                 ent_coef=1,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 lr=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 total_timesteps=int(80e6),
                 lrschedule='linear',
                 fisher_matrix=None,
                 star_param=None,
                 lam=None,
                 batch_size=None):

        sess = tf_util.get_session()
        nbatch = batch_size

        with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE):

            # train_model is used to train our network
            train_model = policy(nbatch, 1, sess)
            eval_model = policy(1, 1, sess)

        OUTPUT = tf.placeholder(tf.float32, [None, 6], name='sample_action')
        # Calculate the loss
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss
        # constant = tf.constant(0.01,shape=train_model.pi.get_shape())

        pi = tf.nn.softmax(OUTPUT)
        # u = tf.argmax(pi,axis=-1)
        # x = tf.one_hot(u, pi.get_shape().as_list()[-1])
        # cross_loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=train_model.pi,labels=x))

        train_model_pi = tf.nn.softmax(train_model.pi3)
        KL = pi * tf.log(pi / (train_model_pi + 1e-4))
        KL_loss = tf.reduce_mean(tf.reduce_sum(KL, 1))
        # Update parameters using the loss.
        # 1. Get the model parameters
        params = find_trainable_variables("a2c_model")
        print(len(params))

        ewc_loss = 0
        # star_param is the array loaded from disk; params are the model's trainable variables (tensors)
        # for i in range(len(fisher_matrix)):
        #     j = 0
        #     for v in range(len(params)-8):
        #         if v % 2 == 0:
        #             C = star_param[v].shape[-1]
        #             W = tf.transpose(tf.reshape(params[v] - star_param[v],[-1,C]))
        #             right = tf.reshape(tf.transpose(tf.matmul(tf.matmul(fisher_matrix[i][j+1], W), fisher_matrix[i][j])),[-1,1])
        #             W_ = tf.reshape(params[v] - star_param[v], [1,-1])
        #             ewc_loss += tf.matmul(W_, right)
        #             j = j + 2
        #         else:
        #             B = tf.reshape(params[v] - star_param[v], [-1,1])
        #             right_B = tf.matmul(fisher_matrix[i][j], B)
        #             B_ = tf.reshape(params[v] - star_param[v], [1,-1])
        #             ewc_loss += tf.matmul(B_, right_B)
        #             j += 1

        # for v in range(len(params)-4):
        #     if v % 2 == 0:
        #         C = int(star_param[v].shape[-1])
        #         W_hat = tf.concat([tf.reshape(params[v], [-1, C]), tf.reshape(params[v + 1], [1, -1])], 0)
        #         W_hat_fixed = tf.concat(
        #             [tf.reshape(star_param[v], [-1, C]), tf.reshape(star_param[v + 1], [1, -1])], 0)
        #
        #         W = tf.transpose(W_hat - W_hat_fixed)
        #         right = tf.reshape(tf.transpose(tf.matmul(tf.matmul(fisher_matrix[v + 1], W), fisher_matrix[v])),[-1, 1])
        #         W_ = tf.reshape(W_hat - W_hat_fixed, [1, -1])
        #         ewc_loss += tf.matmul(W_, right)
        for i in range(len(fisher_matrix)):
            for v in range(len(params) - 6):
                # if v == 6:
                ewc_loss += tf.reduce_sum(
                    tf.multiply(fisher_matrix[i][v].astype(np.float32),
                                tf.square(params[v] - star_param[v])))

        loss1 = KL_loss * ent_coef
        loss2 = ewc_loss * (lam / 2)
        loss = loss1 + loss2
        # 2. Calculate the gradients
        trainer = tf.train.AdamOptimizer(learning_rate=1e-3)
        # trainer = tf.train.RMSPropOptimizer(learning_rate=7e-4, decay=alpha, epsilon=epsilon)
        # grads_and_var = trainer.compute_gradients(loss, params)
        # grads, var = zip(*grads_and_var)
        # grads, _grad_norm = tf.clip_by_global_norm(grads, 0.5)
        #
        # grads_and_var = list(zip(grads, var))

        grads = tf.gradients(loss, params)

        grads_and_var = list(zip(grads, params))
        # zip aggregate each gradient with parameters associated
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        # 3. Make op for one policy and value update step of A2C
        # trainer = tf.train.RMSPropOptimizer(learning_rate=7e-4, decay=alpha, epsilon=epsilon)

        # trainer = tf.train.MomentumOptimizer(learning_rate=0.001,momentum=0.9,use_nesterov=True)
        _train = trainer.apply_gradients(grads_and_var)

        #
        #
        # grads2 = tf.gradients(loss2,params)
        # grads2 = list(zip(grads2, params))
        # trainer2 = tf.train.RMSPropOptimizer(learning_rate=7e-4, decay=alpha, epsilon=epsilon)
        # trainer2 = tf.train.AdamOptimizer(learning_rate=1e-4)
        # trainer2 = tf.train.MomentumOptimizer(learning_rate=0.001, momentum=0.9, use_nesterov=True)
        # _train2 = trainer2.apply_gradients(grads2)

        def train(obs, actions):

            td_map = {
                train_model.X: obs,
                OUTPUT: actions,
                train_model.keep_prob: 1.0
            }

            kl, ewc, l, _ = sess.run([KL_loss, ewc_loss, loss, _train], td_map)

            return kl, ewc, l

            # KL,ewc,_ = sess.run([KL_loss,ewc_loss,_train],td_map)

            # return KL, ewc

        def creat_star_param():
            star_list = []
            for var in params[:-2]:
                star_list.append(star_param[var.name])
            return star_list

        self.creat_star_param = creat_star_param
        self.train = train
        self.train_model = train_model
        self.act = eval_model.step
        self.act2 = eval_model.step2
        self.act3 = eval_model.step3
        self.save = functools.partial(tf_util.save_variables, sess=sess)
        self.load = functools.partial(tf_util.load_variables, sess=sess)
        tf.global_variables_initializer().run(session=sess)
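The loop over fisher_matrix above builds a diagonal EWC penalty; a small NumPy sketch of the same quantity for a single parameter tensor (illustrative arrays, not the model's):

import numpy as np

fisher = np.array([0.5, 2.0, 0.1], dtype=np.float32)       # diagonal Fisher estimate F_i
theta_star = np.array([1.0, -0.3, 0.7], dtype=np.float32)  # parameters frozen after the old task
theta = np.array([1.2, -0.1, 0.7], dtype=np.float32)       # current parameters
lam = 10.0

# EWC penalty: (lam / 2) * sum_i F_i * (theta_i - theta_star_i) ** 2
ewc_penalty = (lam / 2.0) * np.sum(fisher * (theta - theta_star) ** 2)
print(ewc_penalty)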
Example #21
0
def learn_vanilla_a2c(
    network,
    env,
    optimiser,
    seed=None,
    nsteps=5,
    total_timesteps=int(80e6),
    vf_coef=0.5,
    ent_coef=0.01,
    max_grad_norm=0.5,
    lr=7e-4,
    lrschedule='linear',
    epsilon=1e-5,
    alpha=0.99,
    gamma=0.99,
    log_interval=100,
    load_path=None,
    **network_kwargs):

    '''
    Main entrypoint for the A2C algorithm. Trains a policy with the given network architecture on the given environment using A2C.
    Parameters:
    -----------
    network:            policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                        specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                        tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                        neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                        See baselines.common/policies.py/lstm for more details on using recurrent nets in policies
    env:                RL environment. Should implement interface similar to VecEnv (baselines.common/vec_env) or be wrapped with DummyVecEnv (baselines.common/vec_env/dummy_vec_env.py)
    seed:               seed to make the random number sequence in the algorithm reproducible. Defaults to None, which means the seed is taken from the system noise generator (not reproducible)
    nsteps:             int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                        nenv is number of environment copies simulated in parallel)
    total_timesteps:    int, total number of timesteps to train on (default: 80M)
    vf_coef:            float, coefficient in front of value function loss in the total loss function (default: 0.5)
    ent_coef:           float, coefficient in front of the policy entropy in the total loss function (default: 0.01)
    max_grad_norm:      float, gradient is clipped to have global L2 norm no more than this value (default: 0.5)
    lr:                 float, learning rate for RMSProp (current implementation has RMSProp hardcoded in) (default: 7e-4)
    lrschedule:         schedule of learning rate. Can be 'linear', 'constant', or a function [0..1] -> [0..1] that takes fraction of the training progress as input and
                        returns fraction of the learning rate (specified as lr) as output
    epsilon:            float, RMSProp epsilon (stabilizes square root computation in denominator of RMSProp update) (default: 1e-5)
    alpha:              float, RMSProp decay parameter (default: 0.99)
    gamma:              float, reward discounting parameter (default: 0.99)
    log_interval:       int, specifies how frequently the logs are printed out (default: 100)
    **network_kwargs:   keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                        For instance, 'mlp' network architecture has arguments num_hidden and num_layers.
    '''



    set_global_seeds(seed)

    # Get the nb of env
    nenvs = env.num_envs
    policy = build_policy(env, network, **network_kwargs)

    # Instantiate the model object (that creates step_model and train_model)
    model = Model(optimiser=optimiser, policy=policy, env=env, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
        max_grad_norm=max_grad_norm, lr=lr, alpha=alpha, epsilon=epsilon, total_timesteps=total_timesteps, lrschedule=lrschedule)
    if load_path is not None:
        model.load(load_path)

    # Instantiate the runner object
    runner = VanillaRunner(env, model, nsteps=nsteps, gamma=gamma)
    epinfobuf = deque(maxlen=100)

    # Calculate the batch_size
    nbatch = nenvs*nsteps
    
    def kl(new_mean, new_sd, old_mean, old_sd): # approximate KL(N_old || N_new) for diagonal Gaussians, added for logging
        approx_kl = np.log(new_sd/old_sd) + (old_sd**2 + (old_mean - new_mean)**2)/(2.0*new_sd**2 + 10**-8) - 0.5
        approx_kl = np.sum(approx_kl, axis=1)
        approx_kl = np.mean(approx_kl)
        return approx_kl
        
    # model helper functions
    model_params = find_trainable_variables("a2c_model")
    get_flat = U.GetFlat(model_params)

    # Start total timer
    tstart = time.time()

    for update in range(1, int(total_timesteps//nbatch+1)):
        old_params = get_flat()

        # Get mini batch of experiences
        obs, states, rewards, masks, actions, values, epinfos = runner.run()
        epinfobuf.extend(epinfos)
        old_mean, old_sd = model.get_mean_std(obs) # added

        policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
        nseconds = time.time()-tstart

        #added
        new_mean, new_sd = model.get_mean_std(obs)
        approx_kl = kl(new_mean, new_sd, old_mean, old_sd)

        new_params = get_flat()
        grads = new_params - old_params

        # Calculate the fps (frame per second)
        fps = int((update*nbatch)/nseconds)
        if update % log_interval == 0 or update == 1:
            # Check whether the value function is a good predictor of the returns (ev close to 1)
            # or worse than predicting nothing (ev <= 0)
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update*nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("approx_kl", float(approx_kl))
            logger.record_tabular("grad_norm", np.linalg.norm(grads))
            logger.record_tabular("eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.record_tabular("eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.dump_tabular()
    return model
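explained_variance in the logging block above measures how well the critic's values predict the empirical returns; a minimal sketch of the baselines-style definition (1 means a perfect predictor, values at or below 0 mean it is no better than predicting nothing):

import numpy as np

def explained_variance_sketch(values, returns):
    var_returns = np.var(returns)
    return np.nan if var_returns == 0 else 1.0 - np.var(returns - values) / var_returns

print(explained_variance_sketch(np.array([1.0, 2.0, 3.0]), np.array([1.1, 1.9, 3.2])))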
Example #22
0
    def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, nstack,
                 num_procs, ent_coef, vf_coef, max_grad_norm, lr, rprop_alpha,
                 rprop_epsilon, total_timesteps, lrschedule):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=num_procs,
                                inter_op_parallelism_threads=num_procs)
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        nbatch = nenvs * nsteps

        step_model = policy(sess,
                            ob_space,
                            ac_space,
                            nenvs,
                            1,
                            nstack,
                            reuse=False)
        train_model = policy(sess,
                             ob_space,
                             ac_space,
                             nenvs,
                             nsteps,
                             nstack,
                             reuse=True)

        A = train_model.pdtype.sample_placeholder([nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        eps = 1e-6

        #nadv = ADV / (train_model.ret_rms.std + eps)
        #nr = (R - train_model.ret_rms.mean) / (train_model.ret_rms.std + eps)

        nadv = (ADV - train_model.ret_rms.mean) / (train_model.ret_rms.std +
                                                   eps)
        nr = (R - train_model.ret_rms.mean) / (train_model.ret_rms.std + eps)

        nlogpac = -train_model.pd.logp(A)
        pg_loss = tf.reduce_mean(nadv * nlogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), nr))
        #vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vnorm), nr))

        entropy = tf.reduce_mean(train_model.pd.entropy())
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=rprop_alpha,
                                            epsilon=rprop_epsilon)
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)
        avg_norm_ret = tf.reduce_mean(tf.abs(train_model.ret_rms.mean))
        avg_norm_obs = tf.reduce_mean(tf.abs(train_model.ob_rms.mean))

        def train(obs, states, returns, masks, actions, values):

            advs = returns - values
            #advs = (advs - np.mean(advs)) / (np.std(advs) + eps)
            for step in range(len(obs)):
                cur_lr = lr.value()
            if hasattr(train_model, "ob_rms"):
                train_model.ob_rms.update(
                    sess,
                    obs)  # update running mean/std for observations of policy
            if hasattr(train_model, "ret_rms"):
                train_model.ret_rms.update(
                    sess, returns)  # update running mean/std for returns
            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: returns,
                LR: cur_lr
            }
            if states != []:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            ravg_norm_obs, policy_loss, value_loss, policy_entropy, _ = sess.run(
                [avg_norm_obs, pg_loss, vf_loss, entropy, _train], td_map)
            return ravg_norm_obs, policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            make_path(save_path)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            ps = sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
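ob_rms and ret_rms above are running mean/std trackers used to normalise observations, advantages and returns; a minimal sketch of such a tracker (assumed behaviour for illustration; the project's class also takes the session because its statistics live in the graph):

import numpy as np

class RunningMeanStd:
    """Tracks the mean and variance of a stream of batches."""

    def __init__(self, epsilon=1e-4):
        self.mean, self.var, self.count = 0.0, 1.0, epsilon

    def update(self, x):
        batch_mean, batch_var, batch_count = np.mean(x), np.var(x), x.shape[0]
        delta = batch_mean - self.mean
        total = self.count + batch_count
        self.mean += delta * batch_count / total
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        self.var = (m_a + m_b + delta ** 2 * self.count * batch_count / total) / total
        self.count = total

    @property
    def std(self):
        return np.sqrt(self.var)

rms = RunningMeanStd()
rms.update(np.random.randn(64) * 3.0 + 2.0)
normalised = (np.array([2.5]) - rms.mean) / (rms.std + 1e-6)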
Example #23
0
    def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, ent_coef, q_coef, gamma, max_grad_norm, lr,
                 rprop_alpha, rprop_epsilon, total_timesteps, lrschedule,
                 c, trust_region, alpha, delta, icm ):

        sess = get_session()
        nact = ac_space.n
        nbatch = nenvs * nsteps

        A = tf.placeholder(tf.int32, [nbatch]) # actions
        D = tf.placeholder(tf.float32, [nbatch]) # dones
        R = tf.placeholder(tf.float32, [nbatch]) # rewards, not returns
        MU = tf.placeholder(tf.float32, [nbatch, nact]) # mu's
        LR = tf.placeholder(tf.float32, [])
        eps = 1e-6

        step_ob_placeholder = tf.placeholder(dtype=ob_space.dtype, shape=(nenvs,) + ob_space.shape)
        train_ob_placeholder = tf.placeholder(dtype=ob_space.dtype, shape=(nenvs*(nsteps+1),) + ob_space.shape)
        with tf.variable_scope('acer_model', reuse=tf.AUTO_REUSE):

            step_model = policy(nbatch=nenvs , nsteps=1,observ_placeholder=step_ob_placeholder, sess=sess)
            train_model = policy(nbatch=nbatch , nsteps=nsteps, observ_placeholder=train_ob_placeholder, sess=sess)


        params = find_trainable_variables("acer_model")
        print("Params {}".format(len(params)))
        # for var in params:
        #     print(var)

        # create polyak averaged model
        ema = tf.train.ExponentialMovingAverage(alpha)
        ema_apply_op = ema.apply(params)

        def custom_getter(getter, *args, **kwargs):
            v = ema.average(getter(*args, **kwargs))
            print(v.name)
            return v

        with tf.variable_scope("acer_model", custom_getter=custom_getter, reuse=True):
            polyak_model = policy(nbatch=nbatch, nsteps=nsteps, observ_placeholder=train_ob_placeholder, sess=sess)

        # Notation: (var) = batch variable, (var)s = sequence variable, (var)_i = variable indexed by the action at step i
        # shape is [n_envs * (n_steps + 1)]

        # action probability distributions according to train_model, polyak_model and step_model
        # policy.pi holds the distribution parameters; take a softmax to obtain a distribution that sums to 1
        train_model_p = tf.nn.softmax(train_model.pi)
        polyak_model_p = tf.nn.softmax(polyak_model.pi)
        step_model_p = tf.nn.softmax(step_model.pi)
        # train model policy probability and train model q value
        v = tf.reduce_sum(train_model_p * train_model.q, axis = -1) # shape is [nenvs * (nsteps + 1)]

        # strip off last step
        # distribution_f, f_polyak, q_value
        f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps), [train_model_p, polyak_model_p, train_model.q])
        # Get pi and q values for actions taken
        f_i = get_by_index(f, A)
        q_i = get_by_index(q, A)

        # Compute ratios for importance truncation
        rho = f / (MU + eps)
        rho_i = get_by_index(rho, A)

        # Calculate Q_retrace targets
        # passed 
        # R = reward , D = done_ph , v = value ,... rest is same
        qret = q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma)

        # Calculate losses
        # Entropy
        # entropy = tf.reduce_mean(strip(train_model.pd.entropy(), nenvs, nsteps))
        entropy = tf.reduce_mean(cat_entropy_softmax(f)) # f is distribution here 

        # Policy gradient loss, with truncated importance sampling & bias correction
        v = strip(v, nenvs, nsteps, True) # v is value here
        check_shape([qret, v, rho_i, f_i], [[nenvs * nsteps]] * 4)
        check_shape([rho, f, q], [[nenvs * nsteps, nact]] * 2)

        # Truncated importance sampling
        if icm is None :

            adv = qret - v # v is value here

        else : 
            # print("Adv Normalization")
            # > Advantage Normalization
            adv = qret - v
            # m , s = get_mean_and_std(icm_adv)

            # advs = (icm_adv - m) / (s + 1e-7)
            # > Advantage Normalization


        logf = tf.log(f_i + eps)
        #  c is correction term 
        #  importance weight clipping factor : c
        gain_f = logf * tf.stop_gradient(adv * tf.minimum(c, rho_i))  # [nenvs * nsteps]
        loss_f = -tf.reduce_mean(gain_f)

        # Bias correction for the truncation
        adv_bc = (q - tf.reshape(v, [nenvs * nsteps, 1]))  # [nenvs * nsteps, nact]
        logf_bc = tf.log(f + eps) # / (f_old + eps)
        check_shape([adv_bc, logf_bc], [[nenvs * nsteps, nact]]*2)
        gain_bc = tf.reduce_sum(logf_bc * tf.stop_gradient(adv_bc * tf.nn.relu(1.0 - (c / (rho + eps))) * f), axis = 1) #IMP: This is sum, as expectation wrt f
        loss_bc= -tf.reduce_mean(gain_bc)


        loss_policy = loss_f + loss_bc

        # Value/Q function loss, and explained variance
        check_shape([qret, q_i], [[nenvs * nsteps]]*2)
        ev = q_explained_variance(tf.reshape(q_i, [nenvs, nsteps]), tf.reshape(qret, [nenvs, nsteps]))
        loss_q = tf.reduce_mean(tf.square(tf.stop_gradient(qret) - q_i)*0.5)

        # Net loss
        check_shape([loss_policy, loss_q, entropy], [[]] * 3)
        loss = loss_policy + q_coef * loss_q - ent_coef * entropy

        if trust_region:
            g = tf.gradients(- (loss_policy - ent_coef * entropy) * nsteps * nenvs, f) #[nenvs * nsteps, nact]
            # k = tf.gradients(KL(f_pol || f), f)
            k = - f_pol / (f + eps) #[nenvs * nsteps, nact] # Directly computed gradient of KL divergence wrt f
            k_dot_g = tf.reduce_sum(k * g, axis=-1)
            adj = tf.maximum(0.0, (tf.reduce_sum(k * g, axis=-1) - delta) / (tf.reduce_sum(tf.square(k), axis=-1) + eps)) #[nenvs * nsteps]

            # Calculate stats (before doing adjustment) for logging.
            avg_norm_k = avg_norm(k)
            avg_norm_g = avg_norm(g)
            avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g))
            avg_norm_adj = tf.reduce_mean(tf.abs(adj))

            g = g - tf.reshape(adj, [nenvs * nsteps, 1]) * k
            grads_f = -g/(nenvs*nsteps) # trust-region adjusted gradients wrt f, i.e. the statistics of policy pi
            grads_policy = tf.gradients(f, params, grads_f)
            grads_q = tf.gradients(loss_q * q_coef, params)
            grads = [gradient_add(g1, g2, param) for (g1, g2, param) in zip(grads_policy, grads_q, params)]

            avg_norm_grads_f = avg_norm(grads_f) * (nsteps * nenvs)
            norm_grads_q = tf.global_norm(grads_q)
            norm_grads_policy = tf.global_norm(grads_policy)
        else:
            grads = tf.gradients(loss, params)

        if max_grad_norm is not None:
            grads, norm_grads = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))

        if icm is not None :
            # print("with ICM")
            grads = grads + icm.pred_grads_and_vars



        trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=rprop_alpha, epsilon=rprop_epsilon)
        _opt_op = trainer.apply_gradients(grads)

        # so when you call _train, you first do the gradient step, then you apply ema
        with tf.control_dependencies([_opt_op]):
            _train = tf.group(ema_apply_op)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        # Ops/Summaries to run, and their names for logging
        
        if icm is not None :
            # print("With ICM")
            run_ops = [_train, loss, loss_q, entropy, loss_policy, loss_f, loss_bc, ev, norm_grads , 
            icm.forw_loss , icm.inv_loss, icm.icm_loss]
            names_ops = ['loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc', 'explained_variance',
                     'norm_grads' ,'icm.forw_loss' , 'icm.inv_loss', 'icm.icm_loss' ]

            if trust_region:
                run_ops = run_ops + [norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k, avg_norm_g, avg_norm_k_dot_g,
                                     avg_norm_adj  ]
                names_ops = names_ops + ['norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f', 'avg_norm_k', 'avg_norm_g',
                                         'avg_norm_k_dot_g', 'avg_norm_adj'  ]
        else :
            run_ops = [_train, loss, loss_q, entropy, loss_policy, loss_f, loss_bc, ev, norm_grads ]
            names_ops = ['loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc', 'explained_variance',
                     'norm_grads' ]

            if trust_region:
                run_ops = run_ops + [norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k, avg_norm_g, avg_norm_k_dot_g,
                                     avg_norm_adj  ]
                names_ops = names_ops + ['norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f', 'avg_norm_k', 'avg_norm_g',
                                         'avg_norm_k_dot_g', 'avg_norm_adj' ]


        def train(obs, actions, rewards, dones, mus, states, masks, steps, next_states, icm_actions  ):
            cur_lr = lr.value_steps(steps)
            
            if icm is not None :
                print("with ICM ")
                td_map = {train_model.X: obs, polyak_model.X: obs, A: actions, R: rewards, D: dones, MU: mus, LR: cur_lr , 
                 icm.state_:obs, icm.next_state_ : next_states , icm.action_ : icm_actions}
            else :
                td_map = {train_model.X: obs, polyak_model.X: obs, A: actions, R: rewards, D: dones, MU: mus, LR: cur_lr}

            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
                td_map[polyak_model.S] = states
                td_map[polyak_model.M] = masks

            return names_ops, sess.run(run_ops, td_map)[1:]  # strip off _train

        def _step(observation, **kwargs):
            return step_model._evaluate([step_model.action, step_model_p, step_model.state], observation, **kwargs)



        self.train = train
        self.save = functools.partial(save_variables, sess=sess, variables=params)
        self.train_model = train_model
        self.step_model = step_model
        self._step = _step
        self.step = self.step_model.step

        self.initial_state = step_model.initial_state
        tf.global_variables_initializer().run(session=sess)
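q_retrace above produces the Retrace targets that ACER regresses q_i towards; a small NumPy sketch of the standard backward recursion for a single environment (a sketch of the textbook recursion, not the project's q_retrace helper):

import numpy as np

def retrace_targets(rewards, dones, q_taken, values, rho, gamma=0.99):
    """values has one extra bootstrap entry for the state after the last step."""
    T = len(rewards)
    qret = values[T]                      # bootstrap value
    out = np.zeros(T)
    for t in reversed(range(T)):
        qret = rewards[t] + gamma * qret * (1.0 - dones[t])
        out[t] = qret
        rho_bar = min(1.0, rho[t])        # truncated importance weight
        qret = rho_bar * (qret - q_taken[t]) + values[t]
    return out

targets = retrace_targets(
    rewards=np.array([0.0, 1.0, 0.0]),
    dones=np.array([0.0, 0.0, 1.0]),
    q_taken=np.array([0.5, 0.6, 0.2]),
    values=np.array([0.4, 0.5, 0.3, 0.0]),
    rho=np.array([1.2, 0.8, 1.0]),
)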
Example #24
0
File: acer.py Project: MrGoogol/baselines
    def __init__(self, policy, ob_space, ac_space, nenvs, nsteps, ent_coef, q_coef, gamma, max_grad_norm, lr,
                 rprop_alpha, rprop_epsilon, total_timesteps, lrschedule,
                 c, trust_region, alpha, delta):

        sess = get_session()
        nact = ac_space.n
        nbatch = nenvs * nsteps

        A = tf.placeholder(tf.int32, [nbatch]) # actions
        D = tf.placeholder(tf.float32, [nbatch]) # dones
        R = tf.placeholder(tf.float32, [nbatch]) # rewards, not returns
        MU = tf.placeholder(tf.float32, [nbatch, nact]) # mu's
        LR = tf.placeholder(tf.float32, [])
        eps = 1e-6

        step_ob_placeholder = tf.placeholder(dtype=ob_space.dtype, shape=(nenvs,) + ob_space.shape)
        train_ob_placeholder = tf.placeholder(dtype=ob_space.dtype, shape=(nenvs*(nsteps+1),) + ob_space.shape)
        with tf.variable_scope('acer_model', reuse=tf.AUTO_REUSE):

            step_model = policy(nbatch=nenvs, nsteps=1, observ_placeholder=step_ob_placeholder, sess=sess)
            train_model = policy(nbatch=nbatch, nsteps=nsteps, observ_placeholder=train_ob_placeholder, sess=sess)


        params = find_trainable_variables("acer_model")
        print("Params {}".format(len(params)))
        for var in params:
            print(var)

        # create polyak averaged model
        ema = tf.train.ExponentialMovingAverage(alpha)
        ema_apply_op = ema.apply(params)

        def custom_getter(getter, *args, **kwargs):
            v = ema.average(getter(*args, **kwargs))
            print(v.name)
            return v

        with tf.variable_scope("acer_model", custom_getter=custom_getter, reuse=True):
            polyak_model = policy(nbatch=nbatch, nsteps=nsteps, observ_placeholder=train_ob_placeholder, sess=sess)

        # Notation: (var) = batch variable, (var)s = sequence variable, (var)_i = variable indexed by the action at step i

        # action probability distributions according to train_model, polyak_model and step_model
        # policy.pi holds the distribution parameters; take a softmax to obtain a distribution that sums to 1
        train_model_p = tf.nn.softmax(train_model.pi)
        polyak_model_p = tf.nn.softmax(polyak_model.pi)
        step_model_p = tf.nn.softmax(step_model.pi)
        v = tf.reduce_sum(train_model_p * train_model.q, axis = -1) # shape is [nenvs * (nsteps + 1)]

        # strip off last step
        f, f_pol, q = map(lambda var: strip(var, nenvs, nsteps), [train_model_p, polyak_model_p, train_model.q])
        # Get pi and q values for actions taken
        f_i = get_by_index(f, A)
        q_i = get_by_index(q, A)

        # Compute ratios for importance truncation
        rho = f / (MU + eps)
        rho_i = get_by_index(rho, A)

        # Calculate Q_retrace targets
        qret = q_retrace(R, D, q_i, v, rho_i, nenvs, nsteps, gamma)

        # Calculate losses
        # Entropy
        # entropy = tf.reduce_mean(strip(train_model.pd.entropy(), nenvs, nsteps))
        entropy = tf.reduce_mean(cat_entropy_softmax(f))

        # Policy gradient loss, with truncated importance sampling & bias correction
        v = strip(v, nenvs, nsteps, True)
        check_shape([qret, v, rho_i, f_i], [[nenvs * nsteps]] * 4)
        check_shape([rho, f, q], [[nenvs * nsteps, nact]] * 2)

        # Truncated importance sampling
        adv = qret - v
        logf = tf.log(f_i + eps)
        gain_f = logf * tf.stop_gradient(adv * tf.minimum(c, rho_i))  # [nenvs * nsteps]
        loss_f = -tf.reduce_mean(gain_f)

        # Bias correction for the truncation
        adv_bc = (q - tf.reshape(v, [nenvs * nsteps, 1]))  # [nenvs * nsteps, nact]
        logf_bc = tf.log(f + eps) # / (f_old + eps)
        check_shape([adv_bc, logf_bc], [[nenvs * nsteps, nact]]*2)
        gain_bc = tf.reduce_sum(logf_bc * tf.stop_gradient(adv_bc * tf.nn.relu(1.0 - (c / (rho + eps))) * f), axis = 1) #IMP: This is sum, as expectation wrt f
        loss_bc= -tf.reduce_mean(gain_bc)

        loss_policy = loss_f + loss_bc

        # Value/Q function loss, and explained variance
        check_shape([qret, q_i], [[nenvs * nsteps]]*2)
        ev = q_explained_variance(tf.reshape(q_i, [nenvs, nsteps]), tf.reshape(qret, [nenvs, nsteps]))
        loss_q = tf.reduce_mean(tf.square(tf.stop_gradient(qret) - q_i)*0.5)

        # Net loss
        check_shape([loss_policy, loss_q, entropy], [[]] * 3)
        loss = loss_policy + q_coef * loss_q - ent_coef * entropy

        if trust_region:
            g = tf.gradients(- (loss_policy - ent_coef * entropy) * nsteps * nenvs, f) #[nenvs * nsteps, nact]
            # k = tf.gradients(KL(f_pol || f), f)
            k = - f_pol / (f + eps) #[nenvs * nsteps, nact] # Directly computed gradient of KL divergence wrt f
            k_dot_g = tf.reduce_sum(k * g, axis=-1)
            adj = tf.maximum(0.0, (tf.reduce_sum(k * g, axis=-1) - delta) / (tf.reduce_sum(tf.square(k), axis=-1) + eps)) #[nenvs * nsteps]

            # Calculate stats (before doing adjustment) for logging.
            avg_norm_k = avg_norm(k)
            avg_norm_g = avg_norm(g)
            avg_norm_k_dot_g = tf.reduce_mean(tf.abs(k_dot_g))
            avg_norm_adj = tf.reduce_mean(tf.abs(adj))

            g = g - tf.reshape(adj, [nenvs * nsteps, 1]) * k
            grads_f = -g/(nenvs*nsteps) # trust-region adjusted gradients wrt f, i.e. the statistics of policy pi
            grads_policy = tf.gradients(f, params, grads_f)
            grads_q = tf.gradients(loss_q * q_coef, params)
            grads = [gradient_add(g1, g2, param) for (g1, g2, param) in zip(grads_policy, grads_q, params)]

            avg_norm_grads_f = avg_norm(grads_f) * (nsteps * nenvs)
            norm_grads_q = tf.global_norm(grads_q)
            norm_grads_policy = tf.global_norm(grads_policy)
        else:
            grads = tf.gradients(loss, params)

        if max_grad_norm is not None:
            grads, norm_grads = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=rprop_alpha, epsilon=rprop_epsilon)
        _opt_op = trainer.apply_gradients(grads)

        # so when you call _train, you first do the gradient step, then you apply ema
        with tf.control_dependencies([_opt_op]):
            _train = tf.group(ema_apply_op)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        # Ops/Summaries to run, and their names for logging
        run_ops = [_train, loss, loss_q, entropy, loss_policy, loss_f, loss_bc, ev, norm_grads]
        names_ops = ['loss', 'loss_q', 'entropy', 'loss_policy', 'loss_f', 'loss_bc', 'explained_variance',
                     'norm_grads']
        if trust_region:
            run_ops = run_ops + [norm_grads_q, norm_grads_policy, avg_norm_grads_f, avg_norm_k, avg_norm_g, avg_norm_k_dot_g,
                                 avg_norm_adj]
            names_ops = names_ops + ['norm_grads_q', 'norm_grads_policy', 'avg_norm_grads_f', 'avg_norm_k', 'avg_norm_g',
                                     'avg_norm_k_dot_g', 'avg_norm_adj']

        def train(obs, actions, rewards, dones, mus, states, masks, steps):
            cur_lr = lr.value_steps(steps)
            td_map = {train_model.X: obs, polyak_model.X: obs, A: actions, R: rewards, D: dones, MU: mus, LR: cur_lr}
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
                td_map[polyak_model.S] = states
                td_map[polyak_model.M] = masks

            return names_ops, sess.run(run_ops, td_map)[1:]  # strip off _train

        def _step(observation, **kwargs):
            return step_model._evaluate([step_model.action, step_model_p, step_model.state], observation, **kwargs)



        self.train = train
        self.save = functools.partial(save_variables, sess=sess, variables=params)
        self.train_model = train_model
        self.step_model = step_model
        self._step = _step
        self.step = self.step_model.step

        self.initial_state = step_model.initial_state
        tf.global_variables_initializer().run(session=sess)
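The trust_region branch in both ACER examples adjusts the gradient with respect to the policy statistics f so the update does not move far (in KL) from the polyak-averaged policy; a compact sketch mirroring the k, k_dot_g and adj algebra in the code above:

import numpy as np

def trust_region_adjust(g, f, f_pol, delta=1.0, eps=1e-6):
    """g: gradient wrt the softmax probabilities f; returns the KL-constrained gradient."""
    k = -f_pol / (f + eps)                 # gradient of KL(f_pol || f) wrt f
    k_dot_g = np.sum(k * g, axis=-1, keepdims=True)
    adj = np.maximum(0.0, (k_dot_g - delta) /
                     (np.sum(np.square(k), axis=-1, keepdims=True) + eps))
    return g - adj * k                     # remove the component that violates the KL constraint

g = np.array([[0.3, -0.2, 0.1]])
f = np.array([[0.5, 0.3, 0.2]])
f_pol = np.array([[0.4, 0.4, 0.2]])
print(trust_region_adjust(g, f, f_pol))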
Example #25
0
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 n_envs,
                 total_timesteps,
                 nprocs=32,
                 n_steps=20,
                 ent_coef=0.01,
                 vf_coef=0.25,
                 vf_fisher_coef=1.0,
                 learning_rate=0.25,
                 max_grad_norm=0.5,
                 kfac_clip=0.001,
                 lr_schedule='linear'):
        """
        The ACKTR (Actor Critic using Kronecker-Factored Trust Region) model class, https://arxiv.org/abs/1708.05144

        :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
        :param ob_space: (Gym Space) The observation space
        :param ac_space: (Gym Space) The action space
        :param n_envs: (int) The number of environments
        :param total_timesteps: (int) The total number of timesteps for training the model
        :param nprocs: (int) The number of threads for TensorFlow operations
        :param n_steps: (int) The number of steps to run for each environment
        :param ent_coef: (float) The weight for the entropic loss
        :param vf_coef: (float) The weight for the loss on the value function
        :param vf_fisher_coef: (float) The weight for the fisher loss on the value function
        :param learning_rate: (float) The initial learning rate for the RMS prop optimizer
        :param max_grad_norm: (float) The clipping value for the maximum gradient
        :param kfac_clip: (float) gradient clipping for Kullback-Leibler
        :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant',
                                 'double_linear_con', 'middle_drop' or 'double_middle_drop')
        """

        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=nprocs,
                                inter_op_parallelism_threads=nprocs)
        config.gpu_options.allow_growth = True
        self.sess = sess = tf.Session(config=config)
        n_batch = n_envs * n_steps
        action_ph = tf.placeholder(tf.int32, [n_batch])
        advs_ph = tf.placeholder(tf.float32, [n_batch])
        rewards_ph = tf.placeholder(tf.float32, [n_batch])
        pg_lr_ph = tf.placeholder(tf.float32, [])

        self.model = step_model = policy(sess,
                                         ob_space,
                                         ac_space,
                                         n_envs,
                                         1,
                                         reuse=False)
        self.model2 = train_model = policy(sess,
                                           ob_space,
                                           ac_space,
                                           n_envs * n_steps,
                                           n_steps,
                                           reuse=True)

        logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.policy, labels=action_ph)
        self.logits = train_model.policy

        # training loss
        pg_loss = tf.reduce_mean(advs_ph * logpac)
        entropy = tf.reduce_mean(calc_entropy(train_model.policy))
        pg_loss = pg_loss - ent_coef * entropy
        vf_loss = mse(tf.squeeze(train_model.value_fn), rewards_ph)
        train_loss = pg_loss + vf_coef * vf_loss

        # Fisher loss construction
        self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac)
        sample_net = train_model.value_fn + tf.random_normal(
            tf.shape(train_model.value_fn))
        self.vf_fisher = vf_fisher_loss = -vf_fisher_coef * tf.reduce_mean(
            tf.pow(train_model.value_fn - tf.stop_gradient(sample_net), 2))
        self.joint_fisher = pg_fisher_loss + vf_fisher_loss

        self.params = params = find_trainable_variables("model")

        self.grads_check = grads = tf.gradients(train_loss, params)

        with tf.device('/gpu:0'):
            self.optim = optim = kfac.KfacOptimizer(
                learning_rate=pg_lr_ph,
                clip_kl=kfac_clip,
                momentum=0.9,
                kfac_update=1,
                epsilon=0.01,
                stats_decay=0.99,
                async=1,
                cold_iter=10,
                max_grad_norm=max_grad_norm)

            optim.compute_and_apply_stats(self.joint_fisher, var_list=params)
            train_op, q_runner = optim.apply_gradients(list(zip(grads,
                                                                params)))
        self.q_runner = q_runner
        self.learning_rate = Scheduler(initial_value=learning_rate,
                                       n_values=total_timesteps,
                                       schedule=lr_schedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for _ in range(len(obs)):
                cur_lr = self.learning_rate.value()

            td_map = {
                train_model.obs_ph: obs,
                action_ph: actions,
                advs_ph: advs,
                rewards_ph: rewards,
                pg_lr_ph: cur_lr
            }
            if states is not None:
                td_map[train_model.states_ph] = states
                td_map[train_model.masks_ph] = masks

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, train_op], td_map)
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            session_params = sess.run(params)
            joblib.dump(session_params, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for param, loaded_p in zip(params, loaded_params):
                restores.append(param.assign(loaded_p))
            sess.run(restores)

        self.train = train
        self.save = save
        self.load = load
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        tf.global_variables_initializer().run(session=sess)
예제 #26
    def __init__(self,
                 policy,
                 ent_coef=1,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 lr=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 total_timesteps=int(80e6),
                 lrschedule='linear',
                 batch_size=None):

        sess = tf_util.get_session()
        nbatch = batch_size

        with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE):

            # train_model is used to train our network
            train_model = policy(nbatch, 1, sess)
            eval_model = policy(1, 1, sess)

        A = tf.placeholder(train_model.action.dtype, train_model.action.shape)

        OUTPUT = tf.placeholder(tf.float32, [None, 6])

        # Calculate the loss
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss
        pi = tf.nn.softmax(OUTPUT)

        train_model_pi = tf.nn.softmax(train_model.pi2)
        # Policy loss
        # neglogpac = train_model.pd.neglogp(A)
        # L = A(s,a) * -logpi(a|s)
        # entropy_loss = tf.reduce_mean(neglogpac)

        KL = pi * tf.log(pi / train_model_pi)
        KL_loss = tf.reduce_mean(tf.reduce_sum(KL, 1))
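        # KL_loss is the forward KL divergence KL(pi_target || pi_model), summed over
        # the 6 output classes and averaged over the batch.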

        # Update parameters using loss
        # 1. Get the model parameters
        params = find_trainable_variables("a2c_model")
        # regularization_loss = tf.contrib.layers.apply_regularization(regularizer=tf.contrib.layers.l2_regularizer(1e-4),
        #                                                              weights_list=params)
        loss = KL_loss * ent_coef
        # 2. Calculate the gradients
        grads = tf.gradients(loss, params)
        grads = list(zip(grads, params))
        # 3. Make op for one policy and value update step of A2C
        # trainer = tf.train.RMSPropOptimizer(learning_rate=7e-4, decay=alpha, epsilon=epsilon)
        trainer = tf.train.AdamOptimizer(learning_rate=1e-3)

        _train = trainer.apply_gradients(grads)

        def train(obs, actions):
            # Here we calculate advantage A(s,a) = R + yV(s') - V(s)
            # rewards = R + yV(s')

            td_map = {
                train_model.X: obs,
                OUTPUT: actions,
                train_model.keep_prob: 1.0
            }

            l, _ = sess.run([loss, _train], td_map)
            return l

        self.train = train
        self.train_model = train_model
        self.act = eval_model.step
        self.act2 = eval_model.step2
        self.save = functools.partial(tf_util.save_variables, sess=sess)
        self.load = functools.partial(tf_util.load_variables, sess=sess)
        tf.global_variables_initializer().run(session=sess)
예제 #27
def learn_ent_hoof_a2c(network,
                       env,
                       optimiser,
                       seed=None,
                       nsteps=5,
                       total_timesteps=int(1e6),
                       lr_upper_bound=None,
                       ent_upper_bound=None,
                       num_lr=None,
                       num_ent_coeff=None,
                       gamma=0.99,
                       max_kl=None,
                       max_grad_norm=0.5,
                       log_interval=100,
                       load_path=None,
                       **network_kwargs):
    '''
    Main entry point for the A2C algorithm. Train a policy with a given network architecture on a given environment using the A2C algorithm.

    Parameters:
    -----------

    network:            policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list)
                        specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns
                        tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward
                        neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets.
                        See baselines.common/policies.py/lstm for more details on using recurrent nets in policies


    env:                RL environment. Should implement interface similar to VecEnv (baselines.common/vec_env) or be wrapped with DummyVecEnv (baselines.common/vec_env/dummy_vec_env.py)


    seed:               seed to make the random number sequence in the algorithm reproducible. Defaults to None, which means the seed is taken from the system noise generator (not reproducible)

    nsteps:             int, number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where
                        nenv is number of environment copies simulated in parallel)

    total_timesteps:    int, total number of timesteps to train on (default: 1M)

    max_grad_norm:      float, gradient is clipped to have global L2 norm no more than this value (default: 0.5)

    lr_upper_bound:     float, upper bound of the learning-rate range sampled by HOOF at each update (replaces the fixed RMSProp learning rate of plain A2C)

    gamma:              float, reward discounting parameter (default: 0.99)

    log_interval:       int, specifies how frequently the logs are printed out (default: 100)

    **network_kwargs:   keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network
                        For instance, 'mlp' network architecture has arguments num_hidden and num_layers.

    '''
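    # Rough usage sketch (hedged; `make_env` is a hypothetical env factory, the
    # bounds and sample counts are illustrative, and a baselines-style vectorised
    # env such as DummyVecEnv is assumed):
    #   env = DummyVecEnv([make_env] * 4)
    #   model = learn_ent_hoof_a2c('mlp', env, optimiser='RMSProp',
    #                              lr_upper_bound=1e-3, ent_upper_bound=0.01,
    #                              num_lr=10, num_ent_coeff=5,
    #                              total_timesteps=int(1e6))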

    set_global_seeds(seed)

    # Get the number of environments
    nenvs = env.num_envs
    policy = build_policy(env, network, **network_kwargs)

    # Instantiate the model object (that creates step_model and train_model)
    model = Ent_HOOF_Model(optimiser=optimiser,
                           policy=policy,
                           env=env,
                           nsteps=nsteps,
                           total_timesteps=total_timesteps,
                           max_grad_norm=max_grad_norm)
    runner = HOOF_Runner(env, model, nsteps=nsteps, gamma=gamma)
    epinfobuf = deque(maxlen=100)

    # Calculate the batch_size
    nbatch = nenvs * nsteps

    # model helper functions
    model_params = find_trainable_variables("a2c_model")
    get_flat = U.GetFlat(model_params)
    set_from_flat = U.SetFromFlat(model_params)

    def kl(new_mean, new_sd, old_mean, old_sd):
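        # Closed-form KL(old || new) between diagonal Gaussian policies:
        #   KL = log(new_sd / old_sd) + (old_sd^2 + (old_mean - new_mean)^2) / (2 * new_sd^2) - 1/2,
        # summed over action dimensions and averaged over the batch; the 1e-8 term
        # guards against division by zero.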
        approx_kl = np.log(new_sd / old_sd) + (
            old_sd**2 +
            (old_mean - new_mean)**2) / (2.0 * new_sd**2 + 10**-8) - 0.5
        approx_kl = np.sum(approx_kl, axis=1)
        approx_kl = np.mean(approx_kl)
        return approx_kl

    if max_kl is None:  # set max kl to a high val in case there is no constraint
        max_kl = 10**8

    # Start total timer
    tstart = time.time()

    for update in range(1, int(total_timesteps // nbatch + 1)):
        opt_pol_val = -10**8
        approx_kl = np.zeros((num_ent_coeff, num_lr))
        epv = np.zeros((num_ent_coeff, num_lr))
        rand_lr = lr_upper_bound * np.random.rand(num_lr)
        rand_lr = np.sort(rand_lr)
        rand_ent_coeff = ent_upper_bound * np.random.rand(num_ent_coeff)

        old_params = get_flat()
        rms_weights_before_upd = model.get_opt_state()

        obs, states, rewards, masks, actions, values, undisc_rwds, epinfos = runner.run(
        )
        epinfobuf.extend(epinfos)
        old_mean, old_sd, old_neg_ll = model.get_mean_std_neg_ll(obs, actions)
        for nec in range(num_ent_coeff):
            # reset policy and rms prop optimiser
            set_from_flat(old_params)
            model.set_opt_state(rms_weights_before_upd)

            # get grads for loss fn with given entropy coeff
            policy_loss, value_loss, policy_entropy = model.train(
                obs, states, rewards, masks, actions, values,
                rand_ent_coeff[nec])
            new_params = get_flat()
            ent_grads = new_params - old_params

            # enumerate over different LR
            for nlr in range(num_lr):
                new_params = old_params + rand_lr[nlr] * ent_grads
                set_from_flat(new_params)
                new_mean, new_sd, new_neg_ll = model.get_mean_std_neg_ll(
                    obs, actions)
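                # Per-sample importance weights pi_new(a|s) / pi_old(a|s), recovered
                # from the negative log-likelihoods and fed to the weighted
                # importance sampling (WIS) estimate of the candidate policy's value.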
                lik_ratio = np.exp(-new_neg_ll + old_neg_ll)
                est_pol_val = wis_estimate(nenvs, nsteps, undisc_rwds,
                                           lik_ratio)
                approx_kl[nec, nlr] = kl(new_mean, new_sd, old_mean, old_sd)
                epv[nec, nlr] = est_pol_val

                if (nec == 0
                        and nlr == 0) or (est_pol_val > opt_pol_val
                                          and approx_kl[nec, nlr] < max_kl):
                    opt_pol_val = est_pol_val
                    opt_pol_params = get_flat()
                    opt_rms_wts = model.get_opt_state()
                    opt_lr = rand_lr[nlr]
                    opt_ent_coeff = rand_ent_coeff[nec]
                    opt_kl = approx_kl[nec, nlr]

        # update policy and rms prop to optimal wts
        set_from_flat(opt_pol_params)
        model.set_opt_state(opt_rms_wts)

        # Shrink LR search space if too many get rejected
        rejections = np.sum(approx_kl > max_kl) / num_lr
        if rejections > 0.8:
            lr_upper_bound *= 0.8
        if rejections == 0:
            lr_upper_bound *= 1.25

        nseconds = time.time() - tstart

        # Calculate the fps (frame per second)
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            # Calculates whether the value function is a good predictor of the returns (ev close to 1)
            # or if it's just worse than predicting nothing (ev =< 0)
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("opt_lr", float(opt_lr))
            logger.record_tabular("ent_coeff", float(opt_ent_coeff))
            logger.record_tabular("approx_kl", float(opt_kl))
            logger.record_tabular("rejections", rejections)
            logger.record_tabular("lr_ub", lr_upper_bound)
            logger.record_tabular(
                "eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.record_tabular(
                "eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.dump_tabular()
    return model
예제 #28
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 total_timesteps,
                 nprocs=32,
                 nsteps=20,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 vf_fisher_coef=1.0,
                 lr=0.25,
                 max_grad_norm=0.5,
                 kfac_clip=0.001,
                 lrschedule='linear'):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=nprocs,
                                inter_op_parallelism_threads=nprocs)
        config.gpu_options.allow_growth = True
        self.sess = sess = tf.Session(config=config)
        nact = ac_space.n
        nbatch = nenvs * nsteps
        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        PG_LR = tf.placeholder(tf.float32, [])
        VF_LR = tf.placeholder(tf.float32, [])

        self.model = step_model = policy(sess,
                                         ob_space,
                                         ac_space,
                                         nenvs,
                                         1,
                                         reuse=False)
        self.model2 = train_model = policy(sess,
                                           ob_space,
                                           ac_space,
                                           nenvs * nsteps,
                                           nsteps,
                                           reuse=True)

        logpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=train_model.pi, labels=A)
        self.logits = logits = train_model.pi

        # training loss
        pg_loss = tf.reduce_mean(ADV * logpac)
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        pg_loss = pg_loss - ent_coef * entropy
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        train_loss = pg_loss + vf_coef * vf_loss

        # Fisher loss construction
        self.pg_fisher = pg_fisher_loss = -tf.reduce_mean(logpac)
        sample_net = train_model.vf + tf.random_normal(tf.shape(
            train_model.vf))
        self.vf_fisher = vf_fisher_loss = -vf_fisher_coef * tf.reduce_mean(
            tf.pow(train_model.vf - tf.stop_gradient(sample_net), 2))
        self.joint_fisher = joint_fisher_loss = pg_fisher_loss + vf_fisher_loss

        self.params = params = find_trainable_variables("model")

        self.grads_check = grads = tf.gradients(train_loss, params)

        with tf.device('/gpu:0'):
            self.optim = optim = kfac.KfacOptimizer(
                learning_rate=PG_LR,
                clip_kl=kfac_clip,
                momentum=0.9,
                kfac_update=1,
                epsilon=0.01,
                stats_decay=0.99,
                async=1,
                cold_iter=10,
                max_grad_norm=max_grad_norm)

            update_stats_op = optim.compute_and_apply_stats(joint_fisher_loss,
                                                            var_list=params)
            train_op, q_runner = optim.apply_gradients(list(zip(grads,
                                                                params)))
        self.q_runner = q_runner
        self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = self.lr.value()

            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                PG_LR: cur_lr
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks

            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, train_op], td_map)
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)

        self.train = train
        self.save = save
        self.load = load
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        tf.global_variables_initializer().run(session=sess)
예제 #29
    def __init__(self,
                 optimiser,
                 policy,
                 env,
                 nsteps,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 alpha=0.99,
                 epsilon=1e-5,
                 total_timesteps=int(80e6)):

        sess = tf_util.get_session()
        nenvs = env.num_envs
        nbatch = nenvs * nsteps

        with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE):
            # step_model is used for sampling
            step_model = policy(nenvs, 1, sess)

            # train_model is used to train our network
            train_model = policy(nbatch, nsteps, sess)

        A = tf.placeholder(train_model.action.dtype, train_model.action.shape)
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])
        Ent_Coeff = tf.placeholder(tf.float32, [])  # for Entropy

        # Calculate the loss
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Policy loss
        neglogpac = train_model.pd.neglogp(A)
        # L = A(s,a) * -logpi(a|s)
        pg_loss = tf.reduce_mean(ADV * neglogpac)

        # Entropy is used to improve exploration by discouraging premature convergence to a suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # Value loss
        vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R)

        loss = pg_loss - entropy * Ent_Coeff + vf_loss * vf_coef

        # Update parameters using loss
        # 1. Get the model parameters
        params = find_trainable_variables("a2c_model")

        # 2. Calculate the gradients
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        # 3. Make op for one policy and value update step of A2C
        if optimiser == 'RMSProp':
            trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                                decay=alpha,
                                                epsilon=epsilon)
        elif optimiser == 'SGD':
            trainer = tf.train.GradientDescentOptimizer(learning_rate=LR)

        _train = trainer.apply_gradients(grads)

        #https://stackoverflow.com/a/45624533
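        # Collect the optimiser's slot variables (e.g. RMSProp's moving averages) so
        # the optimiser state can be snapshotted with get_opt_state() and restored
        # with set_opt_state(); HOOF rolls the optimiser back to such a snapshot
        # before evaluating each candidate learning rate / entropy coefficient.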
        _slot_vars = [
            trainer.get_slot(var, name) for name in trainer.get_slot_names()
            for var in params
        ]
        SLOTS = [tf.placeholder(tf.float32, slot.shape) for slot in _slot_vars]
        _set_slots = [var.assign(SLOTS[i]) for i, var in enumerate(_slot_vars)]

        def get_opt_state():
            return sess.run(_slot_vars)

        def set_opt_state(state):
            feed = {k: v for k, v in zip(SLOTS, state)}
            return sess.run(_set_slots, feed)

        def train(obs, states, rewards, masks, actions, values, ent_coeff):
            # Here we calculate advantage A(s,a) = R + yV(s') - V(s)
            # rewards = R + yV(s')
            advs = rewards - values
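            # LR is fed as 1.0 below: HOOF applies the effective learning rate
            # outside the graph by rescaling the resulting parameter update
            # (see learn_ent_hoof_a2c).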

            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                Ent_Coeff: ent_coeff,
                LR: 1.0
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train], td_map)
            return policy_loss, value_loss, policy_entropy

        # Only this bit added
        def get_mean_std_neg_ll(obs, actions):
            td_map = {train_model.X: obs, A: actions}
            vals = sess.run(
                [train_model.pd.mean, train_model.pd.std, neglogpac], td_map)
            return vals

        self.get_mean_std_neg_ll = get_mean_std_neg_ll
        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = functools.partial(tf_util.save_variables, sess=sess)
        self.load = functools.partial(tf_util.load_variables, sess=sess)
        self.get_opt_state = get_opt_state
        self.set_opt_state = set_opt_state
        tf.global_variables_initializer().run(session=sess)
예제 #30
    def __init__(self,
                 policy,
                 env,
                 nsteps,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 lr=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 total_timesteps=int(80e6),
                 lrschedule='linear'):

        sess = tf_util.get_session()
        nenvs = env.num_envs
        nbatch = nenvs * nsteps

        with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE):
            step_model = policy(nenvs, 1, sess)
            train_model = policy(nbatch, nsteps, sess)

        A = tf.placeholder(train_model.action.dtype, train_model.action.shape)
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)
        entropy = tf.reduce_mean(train_model.pd.entropy())

        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R)

        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        params = find_trainable_variables("a2c_model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=alpha,
                                            epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()

            td_map = {
                train_model.X: obs,
                A: actions,
                ADV: advs,
                R: rewards,
                LR: cur_lr
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train], td_map)
            return policy_loss, value_loss, policy_entropy

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = functools.partial(tf_util.save_variables, sess=sess)
        self.load = functools.partial(tf_util.load_variables, sess=sess)
        tf.global_variables_initializer().run(session=sess)
예제 #31
File: a2c.py  Project: hitersyw/baselines
    def __init__(self, policy, ob_space, ac_space, nenvs, nsteps,
            ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4,
            alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6),
                 lrschedule='linear', replay_lambda=1, ss_rate=1,
                 replay_loss=None):

        sess = tf_util.make_session()
        nact = ac_space.n
        nbatch = nenvs*nsteps

        # If we have replay_loss, create replay buffer and stage buffer
        # Use this to enforce replay loss lower
        if replay_loss is not None:
            self.replay_buffer = [] # holds all past data

        A = tf.placeholder(tf.int32, [nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
        train_model = policy(sess, ob_space, ac_space, nenvs*nsteps, nsteps, reuse=True)

        neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=train_model.pi, labels=A)
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))

        # Introduce replay_loss if given
        if replay_loss == "L2":
            # Replace train_model.pi with whatever is predicted label
            # Replace A with whatever is recorded label
            re_loss = tf.nn.l2_loss(tf.nn.softmax(train_model.pi) - A) / nbatch
        elif replay_loss == "Distillation":
            # Replace y_donor with whatever is recorded label
            # Replace y_acceptor with whatever is predicted label
            re_loss = tf.reduce_mean( - tf.reduce_sum(tf.stop_gradient(y_donor)
                                                      * tf.log(y_acceptor),
                                                      reduction_indices=1))
        loss = pg_loss - entropy*ent_coef + vf_loss * vf_coef
        if replay_loss is not None:
            loss = loss + replay_lambda*re_loss
        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()
            td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr}
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train],
                td_map
            )
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            make_path(save_path)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            ps = sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
예제 #32
    def __init__(self, policy, env, nsteps,
            ent_coef=0.01, vf_coef=0.5, max_grad_norm=0.5, lr=7e-4,
            alpha=0.99, epsilon=1e-5, total_timesteps=int(80e6), lrschedule='linear'):

        sess = tf_util.get_session()
        nenvs = env.num_envs
        nbatch = nenvs*nsteps


        with tf.variable_scope('a2c_model', reuse=tf.AUTO_REUSE):
            # step_model is used for sampling
            step_model = policy(nenvs, 1, sess)

            # train_model is used to train our network
            train_model = policy(nbatch, nsteps, sess)

        A = tf.placeholder(train_model.action.dtype, train_model.action.shape)
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        # Calculate the loss
        # Total loss = Policy gradient loss - entropy * entropy coefficient + Value coefficient * value loss

        # Policy loss
        neglogpac = train_model.pd.neglogp(A)
        # L = A(s,a) * -logpi(a|s)
        pg_loss = tf.reduce_mean(ADV * neglogpac)

        # Entropy is used to improve exploration by discouraging premature convergence to a suboptimal policy.
        entropy = tf.reduce_mean(train_model.pd.entropy())

        # Value loss
        vf_loss = losses.mean_squared_error(tf.squeeze(train_model.vf), R)

        loss = pg_loss - entropy*ent_coef + vf_loss * vf_coef

        # Update parameters using loss
        # 1. Get the model parameters
        params = find_trainable_variables("a2c_model")

        # 2. Calculate the gradients
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            # Clip the gradients (normalize)
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        # zip pairs each gradient with its associated parameter
        # For instance zip(ABCD, xyza) => Ax, By, Cz, Da

        # 3. Make op for one policy and value update step of A2C
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR, decay=alpha, epsilon=epsilon)

        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, rewards, masks, actions, values):
            # Here we calculate advantage A(s,a) = R + yV(s') - V(s)
            # rewards = R + yV(s')
            advs = rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()

            td_map = {train_model.X:obs, A:actions, ADV:advs, R:rewards, LR:cur_lr}
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train],
                td_map
            )
            return policy_loss, value_loss, policy_entropy


        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = functools.partial(tf_util.save_variables, sess=sess)
        self.load = functools.partial(tf_util.load_variables, sess=sess)
        tf.global_variables_initializer().run(session=sess)
예제 #33
    def __init__(self,
                 optim,
                 policy,
                 ob_dim,
                 ac_dim,
                 num_procs,
                 max_grad_norm=0.5,
                 lr=7e-4,
                 vf_lr=0.001,
                 cv_lr=0.001,
                 cv_num=25,
                 alpha=0.99,
                 epsilon=1e-5,
                 total_timesteps=int(80e6),
                 lrschedule='linear',
                 logdir=None):
        config = tf.ConfigProto(allow_soft_placement=True,
                                intra_op_parallelism_threads=num_procs,
                                inter_op_parallelism_threads=num_procs)
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)

        A = tf.placeholder(tf.float32, [None, ac_dim], name="A")
        ADV = tf.placeholder(tf.float32, [None], name="ADV")
        R = tf.placeholder(tf.float32, [None], name="R")

        train_model = policy(sess, ob_dim, ac_dim, vf_lr, cv_lr, reuse=False)
        step_model = policy(sess, ob_dim, ac_dim, vf_lr, cv_lr, reuse=True)

        params = find_trainable_variables("model")
        tf.summary.histogram("vf", train_model.vf)

        pi_params = [v for v in params if "pi" in v.name]
        vf_params = [v for v in params if "vf" in v.name]

        logpac = train_model.logprob_n
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        pg_loss = -tf.reduce_mean(ADV * logpac)
        tf.summary.scalar("vf_loss", vf_loss)

        if train_model.relaxed:
            ddiff_loss = tf.reduce_mean(train_model.cv)
            ddiff_grads_mean = tf.gradients(ddiff_loss, pi_params)
            ddiff_grads_std = tf.gradients(ddiff_loss, train_model.logstd_1a)
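            # The two expressions below are the analytic gradients of the Gaussian
            # log-density log N(a; mean, std) with respect to the mean and the std:
            #   d/dmean = (a - mean) / std^2,   d/dstd = -1/std + (a - mean)^2 / std^3.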

            dlogp_dmean = (A - train_model.mean) / tf.square(
                train_model.std_na)
            dlogp_dstd = -1 / train_model.std_na + 1 / tf.pow(
                train_model.std_na, 3) * tf.square(A - train_model.mean)

            pi_grads_mean = -((tf.expand_dims(ADV, 1) - train_model.cv) *
                              dlogp_dmean) / tf.to_float(tf.shape(ADV)[0])
            pg_grads_mean = tf.gradients(train_model.mean,
                                         pi_params,
                                         grad_ys=pi_grads_mean)
            pg_grads_mean = [
                pg - dg for pg, dg in zip(pg_grads_mean, ddiff_grads_mean)
            ]
            pi_grads_std = -((tf.expand_dims(ADV, 1) - train_model.cv) *
                             dlogp_dstd) / tf.to_float(tf.shape(ADV)[0])
            pg_grads_std = tf.gradients(train_model.std_na,
                                        train_model.logstd_1a,
                                        grad_ys=pi_grads_std)
            pg_grads_std = [
                pg - dg for pg, dg in zip(pg_grads_std, ddiff_grads_std)
            ]

            pg_grads = pg_grads_mean + pg_grads_std
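            # The control variate (cv) network below is trained to minimise the
            # squared norm of the resulting policy gradient, i.e. to reduce the
            # variance of the gradient estimator.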

            cv_loss = tf.concat([tf.reshape(p, [-1]) for p in pg_grads], 0)
            cv_loss = tf.squeeze(tf.reduce_sum(tf.square(cv_loss)))

            tf.summary.scalar("cv_loss", cv_loss)
            cv_params = [v for v in params if "cv" in v.name]
            cv_grads = tf.gradients(cv_loss, cv_params)
            cv_gradvars = list(zip(cv_grads, cv_params))
        else:
            pg_grads = tf.gradients(pg_loss, pi_params) + tf.gradients(
                pg_loss, train_model.logstd_1a)

        all_policy_grads = tf.concat([tf.reshape(pg, [-1]) for pg in pg_grads],
                                     0)

        # policy gradients
        policy_gradvars = list(
            zip(pg_grads, pi_params + [train_model.logstd_1a]))
        vf_grads = tf.gradients(vf_loss, vf_params)
        vf_gradvars = list(zip(vf_grads, vf_params))

        grads_list = policy_gradvars + vf_gradvars
        if train_model.relaxed: grads_list += cv_gradvars
        for g, v in grads_list:
            tf.summary.histogram(v.name, v)
            tf.summary.histogram(v.name + "_grad", g)

        sum_op = tf.summary.merge_all()
        writer = tf.summary.FileWriter(logdir)

        trainer = optim
        _train = trainer.apply_gradients(policy_gradvars)

        _vf_train = train_model.vf_optim.apply_gradients(vf_gradvars)

        self._step = 0

        def get_cv_grads(obs, old_actions, advs, rewards, vf_in, values):
            advs = rewards - values
            td_map = {
                train_model.ob_no: obs,
                train_model.oldac_na: old_actions,
                train_model.X: vf_in,
                A: old_actions,
                ADV: advs,
                R: rewards
            }
            cv_gs = sess.run(cv_grads, td_map)
            return cv_gs

        def update_cv(mean_cv_gs):
            cv_gvs = list(zip(mean_cv_gs, cv_params))
            train_model.cv_optim.apply_gradients(cv_gvs)

        def update_policy_and_value(obs,
                                    old_actions,
                                    advs,
                                    rewards,
                                    vf_in,
                                    values,
                                    summary=False):
            advs = rewards - values
            td_map = {
                train_model.ob_no: obs,
                train_model.oldac_na: old_actions,
                train_model.X: vf_in,
                A: old_actions,
                ADV: advs,
                R: rewards
            }
            for _ in range(25):
                sess.run(
                    _vf_train, {
                        train_model.ob_no: obs,
                        train_model.oldac_na: old_actions,
                        train_model.X: vf_in,
                        A: old_actions,
                        ADV: advs,
                        R: rewards
                    })
            if summary:
                sum_str, policy_loss, value_loss, _, = sess.run(
                    [sum_op, pg_loss, vf_loss, _train], td_map)
                writer.add_summary(sum_str, self._step)
            else:
                policy_loss, value_loss, _ = sess.run(
                    [pg_loss, vf_loss, _train], td_map)
            self._step += 1
            return policy_loss, value_loss

        def get_grads(obs, old_actions, advs, rewards, vf_in, value):
            advs = rewards - value
            td_map = {
                train_model.ob_no: obs,
                train_model.oldac_na: old_actions,
                train_model.X: vf_in,
                A: old_actions,
                ADV: advs,
                R: rewards
            }
            _g = all_policy_grads  # seems to already have happened? / tf.to_float(tf.shape(rewards)[0])
            pg = sess.run(_g, td_map)
            return pg

        def save(save_path):
            ps = sess.run(params)
            make_path(save_path)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            ps = sess.run(restores)

        self.sess = sess
        self.get_cv_grads = get_cv_grads
        self.update_cv = update_cv
        self.update_policy_and_value = update_policy_and_value
        self.train_model = train_model
        self.step_model = step_model
        self.value = train_model.value
        self.get_grads = get_grads
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
예제 #34
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 nsteps,
                 ent_coef=0.01,
                 vf_coef=0.5,
                 max_grad_norm=0.5,
                 lr=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 total_timesteps=int(20e6),
                 lrschedule='linear'):

        sess = tf.get_default_session()
        nbatch = nenvs * nsteps

        step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
        train_model = policy(sess,
                             ob_space,
                             ac_space,
                             nenvs * nsteps,
                             nsteps,
                             reuse=True)

        A = train_model.pdtype.sample_placeholder([nbatch])
        ADV = tf.placeholder(tf.float32, [nbatch])
        R = tf.placeholder(tf.float32, [nbatch])
        LR = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)
        pg_loss = tf.reduce_mean(ADV * neglogpac)
        vf_loss = tf.reduce_mean(mse(tf.squeeze(train_model.vf), R))
        entropy = tf.reduce_mean(train_model.pd.entropy())
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

        params = find_trainable_variables("model")
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.RMSPropOptimizer(learning_rate=LR,
                                            decay=alpha,
                                            epsilon=epsilon)
        _train = trainer.apply_gradients(grads)

        lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

        def train(obs, states, discounted_rewards, rewards, masks,
                  prev_actions, actions, values, dones):
            advs = discounted_rewards - values
            for step in range(len(obs)):
                cur_lr = lr.value()
            # reshape actions, rewards, and dones to have first dimension of size nenvs*nsteps, existing second dimension
            # this is already done for obs
            rews = np.reshape(rewards, (nbatch, 1))
            ds = np.reshape(np.asarray(dones, dtype=np.float32), (nbatch, 1))
            if len(ac_space.shape) == 0:
                prev_actions = np.reshape(prev_actions, (nbatch, ))
                one_hot = np.eye(ac_space.n)[prev_actions]
                for i in range(nbatch):
                    if prev_actions[i] == -1:
                        one_hot[i, :] = np.zeros((ac_space.n, ), dtype=int)
                x = np.concatenate((obs, one_hot, rews, ds), axis=1)
                actions = np.reshape(actions, (nbatch, ))
            else:
                prev_actions = np.reshape(prev_actions,
                                          (nbatch, ac_space.shape[0]))
                x = np.concatenate((obs, prev_actions, rews, ds), axis=1)
            td_map = {
                train_model.X: x,
                A: actions,
                ADV: advs,
                R: discounted_rewards,
                LR: cur_lr
            }
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            policy_loss, value_loss, policy_entropy, _ = sess.run(
                [pg_loss, vf_loss, entropy, _train], td_map)
            return policy_loss, value_loss, policy_entropy

        def save(save_path):
            ps = sess.run(params)
            make_path(osp.dirname(save_path))
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.initial_state = step_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
예제 #35
  def __init__(self,
               policy,
               ob_space,
               ac_space,
               nenvs,
               total_timesteps,
               nprocs=32,
               nscripts=16,
               nsteps=20,
               nstack=4,
               ent_coef=0.1,
               vf_coef=0.5,
               vf_fisher_coef=1.0,
               lr=0.25,
               max_grad_norm=0.001,
               kfac_clip=0.001,
               lrschedule='linear',
               alpha=0.99,
               epsilon=1e-5):
    config = tf.ConfigProto(
        allow_soft_placement=True,
        intra_op_parallelism_threads=nprocs,
        inter_op_parallelism_threads=nprocs)
    config.gpu_options.allow_growth = True
    self.sess = sess = tf.Session(config=config)
    nsml.bind(sess=sess)
    #nact = ac_space.n
    nbatch = nenvs * nsteps
    A = tf.placeholder(tf.int32, [nbatch])

    XY0 = tf.placeholder(tf.int32, [nbatch])
    XY1 = tf.placeholder(tf.int32, [nbatch])

    # ADV == TD_TARGET - values
    ADV = tf.placeholder(tf.float32, [nbatch])
    TD_TARGET = tf.placeholder(tf.float32, [nbatch])
    PG_LR = tf.placeholder(tf.float32, [])
    VF_LR = tf.placeholder(tf.float32, [])

    self.model = step_model = policy(
        sess, ob_space, ac_space, nenvs, 1, nstack, reuse=False)
    self.model2 = train_model = policy(
        sess, ob_space, ac_space, nenvs, nsteps, nstack, reuse=True)

    # Policy 1 : Base Action : train_model.pi label = A

    script_mask = tf.concat(
        [
            tf.zeros([nscripts * nsteps, 1]),
            tf.ones([(nprocs - nscripts) * nsteps, 1])
        ],
        axis=0)
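    # script_mask is 0 for the first nscripts * nsteps samples (scripted workers)
    # and 1 for the remaining (nprocs - nscripts) * nsteps samples (learning workers).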

    pi = train_model.pi
    pac_weight = script_mask * (tf.nn.softmax(pi) - 1.0) + 1.0
    pac_weight = tf.reduce_sum(pac_weight * tf.one_hot(A, depth=3), axis=1)
    neglogpac = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=pi, labels=A)
    neglogpac *= tf.stop_gradient(pac_weight)

    inv_A = 1.0 - tf.cast(A, tf.float32)

    xy0_mask = tf.cast(A, tf.float32)
    xy1_mask = tf.cast(A, tf.float32)

    condition0 = tf.equal(xy0_mask, 2)
    xy0_mask = tf.where(condition0, tf.ones(tf.shape(xy0_mask)), xy0_mask)
    xy0_mask = 1.0 - xy0_mask

    condition1 = tf.equal(xy1_mask, 2)
    xy1_mask = tf.where(condition1, tf.zeros(tf.shape(xy1_mask)), xy1_mask)
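    # After the two tf.where ops, xy0_mask is 1 only where the base action is 0 and
    # xy1_mask is 1 only where it is 1, so each coordinate head only receives
    # gradient from the samples that actually use it.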

    # One hot representation of chosen marine.
    # [batch_size, 2]
    pi_xy0 = train_model.pi_xy0
    pac_weight = script_mask * (tf.nn.softmax(pi_xy0) - 1.0) + 1.0
    pac_weight = tf.reduce_sum(
        pac_weight * tf.one_hot(XY0, depth=1024), axis=1)

    logpac_xy0 = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=pi_xy0, labels=XY0)
    logpac_xy0 *= tf.stop_gradient(pac_weight)
    logpac_xy0 *= tf.cast(xy0_mask, tf.float32)

    pi_xy1 = train_model.pi_xy1
    pac_weight = script_mask * (tf.nn.softmax(pi_xy1) - 1.0) + 1.0
    pac_weight = tf.reduce_sum(
        pac_weight * tf.one_hot(XY1, depth=1024), axis=1)

    # 1D? 2D?
    logpac_xy1 = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=pi_xy1, labels=XY1)
    logpac_xy1 *= tf.stop_gradient(pac_weight)
    logpac_xy1 *= tf.cast(xy1_mask, tf.float32)

    pg_loss = tf.reduce_mean(ADV * neglogpac)
    pg_loss_xy0 = tf.reduce_mean(ADV * logpac_xy0)
    pg_loss_xy1 = tf.reduce_mean(ADV * logpac_xy1)

    vf_ = tf.squeeze(train_model.vf)

    vf_r = tf.concat(
        [
            tf.ones([nscripts * nsteps, 1]),
            tf.zeros([(nprocs - nscripts) * nsteps, 1])
        ],
        axis=0) * TD_TARGET
    vf_masked = vf_ * script_mask + vf_r

    #vf_mask[0:nscripts * nsteps] = R[0:nscripts * nsteps]

    vf_loss = tf.reduce_mean(mse(vf_masked, TD_TARGET))
    entropy_a = tf.reduce_mean(cat_entropy(train_model.pi))
    entropy_xy0 = tf.reduce_mean(cat_entropy(train_model.pi_xy0))
    entropy_xy1 = tf.reduce_mean(cat_entropy(train_model.pi_xy1))
    entropy = entropy_a + entropy_xy0 + entropy_xy1

    loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

    params = find_trainable_variables("model")
    grads = tf.gradients(loss, params)
    if max_grad_norm is not None:
      grads, _ = tf.clip_by_global_norm(grads, max_grad_norm)
    grads = list(zip(grads, params))
    trainer = tf.train.RMSPropOptimizer(
        learning_rate=lr, decay=alpha, epsilon=epsilon)
    _train = trainer.apply_gradients(grads)

    self.logits = logits = train_model.pi

    # xy0

    self.params_common = params_common = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope='model/common')
    self.params_xy0 = params_xy0 = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES,
        scope='model/xy0') + params_common

    train_loss_xy0 = pg_loss_xy0 - entropy * ent_coef + vf_coef * vf_loss

    self.grads_check_xy0 = grads_xy0 = tf.gradients(
        train_loss_xy0, params_xy0)
    if max_grad_norm is not None:
      grads_xy0, _ = tf.clip_by_global_norm(grads_xy0, max_grad_norm)

    grads_xy0 = list(zip(grads_xy0, params_xy0))
    trainer_xy0 = tf.train.RMSPropOptimizer(
        learning_rate=lr, decay=alpha, epsilon=epsilon)
    _train_xy0 = trainer_xy0.apply_gradients(grads_xy0)

    # xy1

    self.params_xy1 = params_xy1 = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES,
        scope='model/xy1') + params_common

    train_loss_xy1 = pg_loss_xy1 - entropy * ent_coef + vf_coef * vf_loss

    self.grads_check_xy1 = grads_xy1 = tf.gradients(
        train_loss_xy1, params_xy1)
    if max_grad_norm is not None:
      grads_xy1, _ = tf.clip_by_global_norm(grads_xy1, max_grad_norm)

    grads_xy1 = list(zip(grads_xy1, params_xy1))
    trainer_xy1 = tf.train.RMSPropOptimizer(
        learning_rate=lr, decay=alpha, epsilon=epsilon)
    _train_xy1 = trainer_xy1.apply_gradients(grads_xy1)

    self.lr = Scheduler(v=lr, nvalues=total_timesteps, schedule=lrschedule)

    def train(obs, states, td_targets, masks, actions, xy0, xy1, values):
      advs = td_targets - values
      for step in range(len(obs)):
        cur_lr = self.lr.value()

      td_map = {
          train_model.X: obs,
          A: actions,
          XY0: xy0,
          XY1: xy1,
          ADV: advs,
          TD_TARGET: td_targets,
          PG_LR: cur_lr
      }
      if states != []:
        td_map[train_model.S] = states
        td_map[train_model.M] = masks

      policy_loss, value_loss, policy_entropy, _, \
      policy_loss_xy0, policy_entropy_xy0, _, \
      policy_loss_xy1, policy_entropy_xy1, _ = sess.run(
          [pg_loss, vf_loss, entropy, _train,
           pg_loss_xy0, entropy_xy0, _train_xy0,
           pg_loss_xy1, entropy_xy1, _train_xy1],
          td_map)
      return policy_loss, value_loss, policy_entropy, \
             policy_loss_xy0, policy_entropy_xy0, \
             policy_loss_xy1, policy_entropy_xy1

    def save(save_path):
      ps = sess.run(params)
      joblib.dump(ps, save_path)

    def load(load_path):
      loaded_params = joblib.load(load_path)
      restores = []
      for p, loaded_p in zip(params, loaded_params):
        restores.append(p.assign(loaded_p))
      sess.run(restores)

    self.train = train
    self.save = save
    self.load = load
    self.train_model = train_model
    self.step_model = step_model
    self.step = step_model.step
    self.value = step_model.value
    self.initial_state = step_model.initial_state
    print("global_variables_initializer start")
    tf.global_variables_initializer().run(session=sess)
    print("global_variables_initializer complete")